from datetime import datetime
date = datetime.now()
formatted_date = date.strftime("%B %d, %Y")
print(formatted_date)

March 04, 2026

# Check GPU memory
!nvidia-smi

Wed Mar  4 04:10:26 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.82.07              Driver Version: 580.82.07      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   32C    P0             44W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+

# Check system RAM
!free -h

               total        used        free      shared  buff/cache   available
Mem:            83Gi       1.3Gi        78Gi       2.0Mi       3.7Gi        81Gi
Swap:             0B          0B          0B

from google.colab import drive
drive.mount('/content/drive')

from google.colab import userdata
userdata.get('HF_TOKEN')

# Set up the current working directory within the Google Drive
%cd /content/drive/My\ Drive/Colab\ Notebooks/LLM/sped_biblio/topic_modeling

Mounted at /content/drive
/content/drive/My Drive/Colab Notebooks/LLM/sped_biblio/topic_modeling

!pip install --upgrade -q "numpy>=2.0" "pandas<2.3" openpyxl sentence-transformers bertopic scikit-learn matplotlib umap-learn hdbscan python-dotenv openai

import os
import warnings
warnings.filterwarnings('ignore', category=SyntaxWarning, module='hdbscan', message="invalid escape sequence '\\{'")

import re
from collections import defaultdict
import pickle
from pickle import UnpicklingError
from pickle import PicklingError
import itertools

# Data Manipulation
import numpy as np
import pandas as pd
import requests

# Natural Language Processing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

# Generating Topic Labels
import openai
from dotenv import load_dotenv

# Clustering
from hdbscan import HDBSCAN
from umap import UMAP
from scipy.cluster import hierarchy as sch

# Visualization Imports
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import plotly.colors as pc
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from matplotlib.ticker import FuncFormatter
import colorlover as cl
import textwrap

# Progress Bar
from tqdm import tqdm

# Display HTML
from IPython.display import IFrame

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

def generate_topic_labels(topic_info, topic_model, openai_model="gpt-4o", max_tokens=10, temperature=0.3):

    gen_names = []

    for topic_id in topic_info['Topic']:
        if topic_id == -1:
            gen_names.append("Outlier")
            continue

        keywords = topic_model.get_topic(topic_id)
        if keywords:
            top_keywords = ", ".join([keyword[0] for keyword in keywords[:10]])
            prompt = f"""
You are a highly skilled data scientist specializing in generating concise and descriptive topic labels based on provided top terms for each topic.
Each topic consists of a list of terms ordered from most to least significant.

Your objective is to create precise and concise labels that capture the essence of each topic by following these guidelines:

1. Use Person-First Language:
   - Prioritize respectful and inclusive language.
   - Avoid terms that may be considered offensive or stigmatizing.
   - For example, use "students with learning disabilities" instead of "disabled students".

2. Analyze the significance of the top terms:
   - Focus primarily on the most significant terms.
   - Include additional terms if they add essential context.

3. Synthesize the Topic Label:
   - Ensure clarity and conciseness (aim for 4-5 words).
   - Reflect the collective meaning of the most influential terms.
   - Use descriptive yet precise phrasing.

4. Maintain consistency:
   - Capitalize the first word using title case.
   - Use uniform formatting and avoid ambiguity.
   - Make consie and complete expressions.

Example
----------
Top 10 Keywords in [Representation]:
virtual manipulatives, manipulatives, mathematical, app, solving, learning disability, algebra, area, tool, concrete manipulatives

Generated Topic Label in [GenName]:
Visual-based technology for mathematical problem solving

Top 10 Keywords: {top_keywords}
Generated Topic Label in [GenName]:
"""
            response = openai.chat.completions.create(
                model=openai_model,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=temperature
            )
            gen_name = response.choices[0].message.content.strip()
            gen_name = re.sub(r'Generated Topic Label in \[GenName\]:', '', gen_name, flags=re.IGNORECASE).strip()
            gen_names.append(gen_name)
        else:
            gen_names.append("No Keywords")

    topic_info['GenName'] = gen_names
    return topic_info

all_data_file = f"files/all_data.xlsx"
all_data = pd.read_excel(all_data_file, na_filter=False)

df = all_data[all_data['filtered'] == 'Yes'].reset_index(drop=True)
df['Year'] = df['PY'].astype(int)
df['Decade'] = (df['Year'] // 10) * 10
df['CR'] = df['CR'].astype(str)

import logging
logging.getLogger("sentence_transformers").setLevel(logging.ERROR)
logging.getLogger("transformers").setLevel(logging.ERROR)

sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

logging.getLogger("sentence_transformers").setLevel(logging.WARNING)
logging.getLogger("transformers").setLevel(logging.WARNING)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

publication_embeddings = sentence_model.encode(df['combined_text'].tolist(), show_progress_bar=True)
publication_embeddings_df = pd.DataFrame(publication_embeddings)
publication_embeddings_df['UT'] = df['UT'].tolist()

Batches:   0%|          | 0/114 [00:00<?, ?it/s]

umap_model = UMAP(
    n_neighbors=20,
    n_components=2,
    min_dist=0.00,
    metric='cosine',
    random_state=42
)

reduced_embeddings = umap_model.fit_transform(publication_embeddings_df.drop(columns='UT'))
reduced_embeddings_df = pd.DataFrame(reduced_embeddings, columns=["x", "y"])
reduced_embeddings_df['UT'] = publication_embeddings_df['UT'].tolist()

hdbscan_model = HDBSCAN(
    min_cluster_size=35,
    min_samples=25,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True
)

hdbscan_model.fit(reduced_embeddings)
labels = hdbscan_model.labels_

vectorizer_model =  CountVectorizer(min_df=10)
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
representation_model = KeyBERTInspired()

topic_model = BERTopic(
  embedding_model=sentence_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  ctfidf_model=ctfidf_model,
  representation_model=representation_model,
  calculate_probabilities=True,
  verbose=True
)

topics, probs = topic_model.fit_transform(df['combined_text'])

2026-03-04 04:15:42,451 - BERTopic - Embedding - Transforming documents to embeddings.

Batches:   0%|          | 0/114 [00:00<?, ?it/s]

2026-03-04 04:15:45,827 - BERTopic - Embedding - Completed ✓
2026-03-04 04:15:45,828 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-03-04 04:16:10,753 - BERTopic - Dimensionality - Completed ✓
2026-03-04 04:16:10,754 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-03-04 04:16:11,031 - BERTopic - Cluster - Completed ✓
2026-03-04 04:16:11,035 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-03-04 04:16:12,160 - BERTopic - Representation - Completed ✓

topic_model.get_topic_info()

df["combined_text"] = (
    df["combined_text"]
    .str.replace(
        r"(?i)\baugmentative and alternative communication\b",
        "AAC",
        regex=True
    )
    .str.replace(
        r"(?i)\bapplied behavior analysis\b",
        "ABA",
        regex=True
    )
    .str.replace(
        r"(?i)\bautism spectrum disorder\b",
        "ASD",
        regex=True
    )
)

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
custom_stop_words = list(ENGLISH_STOP_WORDS - {"what"})

vectorizer_model = CountVectorizer(stop_words=custom_stop_words, ngram_range=(1, 3), min_df=10)
topic_model.update_topics(df['combined_text'], vectorizer_model=vectorizer_model)

new_topics = topic_model.reduce_outliers(df['combined_text'], topics, strategy="c-tf-idf")

import logging
logging.getLogger("BERTopic").setLevel(logging.ERROR)
topic_model.update_topics(df['combined_text'], topics=new_topics, vectorizer_model=vectorizer_model)
logging.getLogger("BERTopic").setLevel(logging.WARNING)

topic_model.get_topic_info()

hierarchical_topics_init = topic_model.hierarchical_topics(df['combined_text'])

fig_hierarchical_init = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics_init)

fig_hierarchical_init.update_layout(
    width=1000,
    height=800,
    margin=dict(t=60, b=60, l=120, r=60),
    plot_bgcolor='white',
    paper_bgcolor='white',
    font_size=14,
    title="Hierarchical Topic Clustering"
)

fig_hierarchical_init.update_layout(width=None, height=None, autosize=True)
fig_hierarchical_init.write_html(
    "results/fig_hierarchical_init.html",
    config={"responsive": True}
)

# fig_hierarchical_init.show()

100%|██████████| 20/20 [00:00<00:00, 284.56it/s]

IFrame(src='results/fig_hierarchical_init.html', width=1000, height=800)

topics_to_merge = [[2, 7],
                  [6, 10],
                  [1, 13],
                  [5, 8],
                  [12, 19],
                  [15, 20]]
topic_model.merge_topics(df['combined_text'], topics_to_merge)

hierarchical_topics = topic_model.hierarchical_topics(df['combined_text'])

fig_hierarchical = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

fig_hierarchical.update_layout(
    width=1000,
    height=800,
    margin=dict(t=60, b=60, l=120, r=60),
    plot_bgcolor='white',
    paper_bgcolor='white',
    font_size=14,
    title="Hierarchical Topic Clustering"
)

fig_hierarchical.update_layout(width=None, height=None, autosize=True)
fig_hierarchical.write_html(
    "results/fig_hierarchical.html",
    config={"responsive": True}
)

# fig_hierarchical.show()

100%|██████████| 14/14 [00:00<00:00, 286.07it/s]

IFrame(src='results/fig_hierarchical.html', width=1000, height=800)

document_info = topic_model.get_document_info(df["combined_text"])
df["Topic"] = document_info["Topic"] + 1

topic_info = topic_model.get_topic_info()

topic_info_gen = generate_topic_labels(topic_info, topic_model)

topic_model.set_topic_labels(topic_info_gen['GenName'].tolist())

for i in range(probs.shape[1]):
    df[f'Prob_Topic_{i}'] = probs[:, i]

data = []
for topic in topic_info['Topic']:
    words_with_scores = topic_model.get_topic(topic)
    for word, score in words_with_scores:
        data.append({"Topic": topic, "Word": word, "Score": score})

topic_model_words = pd.DataFrame(data)

topic_info_words = topic_info.merge(topic_model_words, how="left", on="Topic") \
                                                      .sort_values(by=['Topic', 'Score'], ascending=[True, False])

topic_info['Topic'] = topic_info['Topic'] + 1
topic_info_words['Topic'] = topic_info_words['Topic'] + 1

topic_info_gen

topic_info.to_excel("files/topic_info.xlsx", engine='openpyxl', index=False)
topic_info_words.to_excel("files/topic_info_words.xlsx", engine='openpyxl', index=False)
df.to_excel("files/df.xlsx", engine='openpyxl', index=False)

# topic_info = pd.read_excel("files/topic_info.xlsx")
# topic_info_words = pd.read_excel("files/topic_info_words.xlsx")
# df = pd.read_excel("files/df.xlsx")
topic_info_human_loop = pd.read_excel("files/topic_info_human_loop.xlsx")

topic_info_human_loop

#topic_info = topic_info.drop(columns=['Topic'])
#topic_info_concat = pd.merge(topic_info, topic_info_human_loop[['Name', 'Topic', 'CustomLabel']], how="left", on="Name")
topic_info_concat = pd.merge(topic_info, topic_info_human_loop[['Name', 'CustomLabel']], how="left", on="Name")
order = ['Topic', 'Count', 'Name', 'Representation', 'GenName', 'CustomLabel']
topic_info_concat = topic_info_concat.reindex(columns=order)

#topic_info_words = topic_info_words.drop(columns=['Topic'])
#topic_info_words_concat = pd.merge(topic_info_words, topic_info_human_loop[['Name', 'Topic', 'CustomLabel']], how="left", on="Name")
topic_info_words_concat = pd.merge(topic_info_words, topic_info_human_loop[['Name', 'CustomLabel']], how="left", on="Name")
order = ['Topic', 'Count', 'Name', 'Representation', 'Word', 'Score', 'Representative_Docs', 'GenName', 'CustomLabel']
topic_info_words_concat = topic_info_words_concat.reindex(columns=order)

files = {
    "files/topic_model.pkl": topic_model,
    "files/publication_embeddings.pkl": publication_embeddings,
    "files/publication_embeddings_df.pkl": publication_embeddings_df,
    "files/reduced_embeddings.pkl": reduced_embeddings,
    "files/reduced_embeddings_df.pkl": reduced_embeddings_df,
    "files/topic_info.pkl": topic_info,
    "files/topic_info_concat.pkl": topic_info_concat,
    "files/topic_info_words.pkl": topic_info_words,
    "files/topic_info_words_concat.pkl": topic_info_words_concat,
    "files/df.pkl": df
}

for filename, data in files.items():
    if data is None:
        continue

    try:
        with open(filename, "wb") as f:
            pickle.dump(data, f)
    except Exception as e:
        print(f"Error saving pickle for {filename}: {e}")

    try:
        excel_filename = filename.rsplit('.', 1)[0] + ".xlsx"
        if isinstance(data, pd.DataFrame):
            data.to_excel(excel_filename, index=False)
        elif isinstance(data, np.ndarray) and data.ndim in [1, 2]:
            pd.DataFrame(data).to_excel(excel_filename, index=False)
    except Exception as e:
        print(f"Error saving Excel for {filename}: {e}")

#def load_data_from_files(files_dict):
#   loaded_data = {}
#   for filename, var_name in files_dict.items():
#       filepath = os.path.join('files', filename)
#       if os.path.exists(filepath) and os.path.getsize(filepath) > 0:
#           try:
#               with open(filepath, "rb") as f:
#                   loaded_data[var_name] = pickle.load(f)
#           except Exception as e:
#               loaded_data[var_name] = None
#               print(f"Error loading {filepath}: {e}")
#       else:
#           loaded_data[var_name] = None
#   return loaded_data

#files_to_load = {
#   "topic_model.pkl": "topic_model",
#   "publication_embeddings.pkl": "publication_embeddings",
#   "publication_embeddings_df.pkl": "publication_embeddings_df",
#   "reduced_embeddings.pkl": "reduced_embeddings",
#   "reduced_embeddings_df.pkl": "reduced_embeddings_df",
#   "topic_info.pkl": "topic_info",
#   "topic_info_concat.pkl": "topic_info_concat",
#   "topic_info_words.pkl": "topic_info_words",
#   "topic_info_words_concat.pkl": "topic_info_words_concat",
#   "df.pkl": "df"
#}

#loaded_data = load_data_from_files(files_to_load)
#topic_model = loaded_data.get("topic_model")
#publication_embeddings = loaded_data.get("publication_embeddings")
#publication_embeddings_df = loaded_data.get("publication_embeddings_df")
#reduced_embeddings = loaded_data.get("reduced_embeddings")
#reduced_embeddings_df = loaded_data.get("reduced_embeddings_df")
#topic_info = loaded_data.get("topic_info")
#topic_info_concat = loaded_data.get("topic_info_concat")
#topic_info_words = loaded_data.get("topic_info_words")
#topic_info_words_concat = loaded_data.get("topic_info_words_concat")
#df = loaded_data.get("df")

topic_word_data = topic_info_words_concat.copy()

topic_word_data['Word (c-TF-IDF)'] = topic_word_data.apply(
    lambda row: f"{row['Word']} ({row['Score']:.3f})", axis=1
)

topic_word_data = topic_word_data.sort_values(by=['CustomLabel', 'Score'], ascending=[True, False])

topic_word_table = topic_word_data.groupby('CustomLabel', sort=False).agg(
    Topic=('Topic', 'first'),
    Count=('Count', 'first'),
    **{'Word (c-TF-IDF)': ('Word (c-TF-IDF)', lambda x: ', '.join(x))}
).reset_index()[['Topic', 'CustomLabel', 'Count', 'Word (c-TF-IDF)']]

topic_word_table = topic_word_table.sort_values(by='Topic')

styles = [
    dict(selector="caption", props=[("caption-side", "top")]),
    {'selector': 'th', 'props': [('text-align', 'center')]},
    {'selector': 'td', 'props': [('text-align', 'left')]}
]

topic_word_table = (
    topic_word_table.style
    .set_caption("<b>Topic Word Score</b>")
    .set_table_styles(styles)
    .set_properties(**{
        'font-size': '24px',
        'font-family': 'Helvetica Neue',
        'color': 'black',
        'text-align': 'center'
    })
    .set_properties(**{
        'text-align': 'center',
        'font-size': '14px',
        'font-family': 'Helvetica Neue',
        'color': 'black',
    }, subset=pd.IndexSlice[:, topic_word_table.columns])
    .hide(axis='index')
)

topic_word_table

color_map = [
    '#EF5350',
    '#42A5F5',
    '#66BB6A',
    '#FFA726',
    '#AB47BC',
    '#26A69A',
    '#FFEE58',
    '#FF8A65',
    '#EC407A',
    '#90A4AE',
    '#C0CA33',
    '#4DD0E1',
    '#A1887F',
    '#CE93D8',
    '#7986CB'
]

docs_topics_data = pd.merge(df, topic_info_concat[['Topic', 'CustomLabel']], on='Topic', how='left')
docs_topics_data['Topic'] = pd.to_numeric(docs_topics_data['Topic'], errors='coerce')
docs_topics_data = pd.merge(docs_topics_data, reduced_embeddings_df[['UT', 'x', 'y']], on='UT', how='left')
docs_topics_data['x'] = pd.to_numeric(docs_topics_data['x'], errors='coerce')
docs_topics_data['y'] = pd.to_numeric(docs_topics_data['y'], errors='coerce')
docs_topics_data = docs_topics_data.dropna(subset=['Topic', 'x', 'y'])
#docs_topics_data = df.copy()
docs_topics_data['Topic'] = docs_topics_data['Topic'].astype(int)

ordered_legend = docs_topics_data.sort_values('Topic')['CustomLabel'].unique().tolist()
ordered_legend = [x for x in ordered_legend if pd.notnull(x)]
docs_topics_data['legend_topic'] = pd.Categorical(docs_topics_data['CustomLabel'],
                                                    categories=ordered_legend,
                                                    ordered=True)

docs_topics_data['hover_text'] = (
    "Title: " +
    docs_topics_data['TI'].apply(lambda text: "<br>".join(textwrap.wrap(text, width=50))) +
    "<br><br>Topic " + docs_topics_data['Topic'].astype(str) + ": " + docs_topics_data['CustomLabel']
)

color_mapping = {cat: color_map[i % len(color_map)] for i, cat in enumerate(ordered_legend)}

fig_cluster = px.scatter(
    docs_topics_data,
    x='x',
    y='y',
    color='legend_topic',
    color_discrete_map=color_mapping,
    labels={'x': 'X', 'y': 'Y'},
    custom_data=['hover_text'],
    category_orders={'legend_topic': ordered_legend}
)

fig_cluster.update_traces(
    marker=dict(size=10, opacity=0.9, line=dict(width=1, color="white")),
    selector=dict(mode='markers'),
    hovertemplate='<b>%{customdata[0]}</b><extra></extra>'
)

fig_cluster.update_traces(
    hoverlabel=dict(font_size=16)
)

fig_cluster.update_layout(
    title="<b>Semantic Space of Documents by Topic</b>",
    title_x=0.5,
    title_font=dict(size=24, family="Helvetica Neue", color="black"),
    margin=dict(t=80, b=80, l=80, r=80),
    plot_bgcolor='white',
    paper_bgcolor='white',
    width=900,
    height=800,
    showlegend=True,
    legend=dict(
        orientation="h",
        yanchor="top",
        y=-0.15,
        xanchor="center",
        x=0.5,
        title_text="",
        font=dict(size=14),
        title_font=dict(size=16)
    )
)

fig_cluster.update_layout(width=None, height=None, autosize=True)
fig_cluster.write_html(
    "results/fig_cluster.html",
    config={"responsive": True}
)
# fig_cluster.show()

IFrame(src='results/fig_cluster.html', width=900, height=1000)

merged_df = pd.merge(df, publication_embeddings_df, on='UT')

embeddings = [col for col in publication_embeddings_df.columns if col != 'UT']
topic_embeddings = merged_df.groupby("Topic")[embeddings].mean()

cosine_sim_matrix = cosine_similarity(topic_embeddings)

cosine_sim_df = pd.DataFrame(
    cosine_sim_matrix,
    index=topic_embeddings.index,
    columns=topic_embeddings.index
)

topic_to_label = topic_info_concat.set_index('Topic')['CustomLabel'].to_dict()

hover_text = []
for i, row_topic in enumerate(cosine_sim_df.index):
    row_hover = []
    for j, col_topic in enumerate(cosine_sim_df.columns):
        row_label = topic_to_label.get(row_topic, f"Topic {row_topic}")
        col_label = topic_to_label.get(col_topic, f"Topic {col_topic}")
        similarity = cosine_sim_df.iloc[i, j]
        hover = f'Topic {row_topic}: {row_label}<br>Topic {col_topic}: {col_label}<br>Cosine Similarity: {similarity:.3f}'
        row_hover.append(hover)
    hover_text.append(row_hover)

fig_cosine_sim = px.imshow(
    cosine_sim_df,
    color_continuous_scale="YlGnBu",
    origin="lower",
    labels=dict(color="Similarity"),
    x=cosine_sim_df.columns,
    y=cosine_sim_df.index,
    text_auto=True
)

fig_cosine_sim.update_xaxes(
    tickmode='array',
    tickvals=cosine_sim_df.columns,
    ticktext=[str(topic) for topic in cosine_sim_df.columns],
    tickfont=dict(size=13)
)

fig_cosine_sim.update_yaxes(
    tickmode='array',
    tickvals=cosine_sim_df.index,
    ticktext=[str(topic) for topic in cosine_sim_df.index],
    tickfont=dict(size=13)
)

fig_cosine_sim.update_traces(
    texttemplate='%{z:.2f}',
    textfont=dict(size=13),
    customdata=hover_text,
    hovertemplate='%{customdata}<extra></extra>',
    hoverlabel=dict(font_size=13)
)

fig_cosine_sim.update_layout(
    title="<b>Cosine Similarity Among Topics</b>",
    title_x=0.5,
    title_font=dict(size=24, family="Helvetica Neue", color="black"),
    margin=dict(t=80, b=80, l=80, r=80),
    plot_bgcolor='white',
    paper_bgcolor='white',
    width=900,
    height=900,
    showlegend=True,
    legend_title_text="Topic",
    legend_title_font=dict(size=13),
    legend_font=dict(size=13),
    font=dict(size=13)
)

x_dom = fig_cosine_sim.layout.xaxis.domain
x_center = (x_dom[0] + x_dom[1]) / 2
x_len = 1 * (x_dom[1] - x_dom[0])

fig_cosine_sim.update_layout(
    coloraxis_colorbar=dict(
        orientation='h',
        x=x_center,
        xanchor='center',
        y=-0.13,
        yanchor='top',
        len=x_len,
        thickness=12
    )
)

fig_cosine_sim.update_layout(width=None, height=None, autosize=True)
fig_cosine_sim.write_html(
    "results/fig_cosine_sim.html",
    config={"responsive": True}
)
# fig_cosine_sim.show()

IFrame(src='results/fig_cosine_sim.html', width=900, height=900)

vectorizer = vectorizer_model.build_analyzer()

df = docs_topics_data.copy()

def extract_ngrams(text):
    all_ngrams = vectorizer(text)
    unigrams = [ng for ng in all_ngrams if len(ng.split()) == 1]
    bigrams  = [ng for ng in all_ngrams if len(ng.split()) == 2]
    trigrams = [ng for ng in all_ngrams if len(ng.split()) == 3]
    return unigrams, bigrams, trigrams

df['unigrams'], df['bigrams'], df['trigrams'] = zip(*df['combined_text'].apply(extract_ngrams))

with open("files/df.pkl", "wb") as f_df:
    pickle.dump(df, f_df)

df.to_excel("files/df.xlsx", engine='openpyxl', index=False)
topic_word_table.to_excel("files/topic_word_table.xlsx", engine='openpyxl', index=False)
cosine_sim_df.to_excel("files/cosine_sim_df.xlsx", engine='openpyxl', index=False)

# with open("files/df.pkl", "rb") as f_df:
#     df = pickle.load(f_df)

# df = pd.read_excel("files/df.xlsx")
# topic_word_table = pd.read_excel("files/topic_word_table.xlsx")
# cosine_sim_df = pd.read_excel("files/cosine_sim_df.xlsx")

from nbconvert import HTMLExporter
import nbformat

notebook_path = 'index.ipynb'
html_exporter = HTMLExporter()

with open(notebook_path, 'r', encoding='utf-8') as nb_file:
    notebook_content = nb_file.read()
    notebook = nbformat.reads(notebook_content, as_version=4)

if 'widgets' in notebook.metadata and 'application/vnd.jupyter.widget-state+json' in notebook.metadata['widgets']:
    if 'state' not in notebook.metadata['widgets']['application/vnd.jupyter.widget-state+json']:
        notebook.metadata['widgets']['application/vnd.jupyter.widget-state+json']['state'] = {}

html_output, _ = html_exporter.from_notebook_node(notebook)

with open('index.html', 'w', encoding='utf-8') as html_file:
    html_file.write(html_output)

	Topic	Count	Name	Representation	Representative_Docs
0	-1	1054	-1_interventions_assessment_behavioral_behaviors	[interventions, assessment, behavioral, behavi...	[ONLINE INDIRECT GROUP TREATMENT FOR PRESCHOOL...
1	0	391	0_autism_research_interventions_studies	[autism, research, interventions, studies, the...	[EVALUATION OF A TELEHEALTH ABA PROGRAM FOR CA...
2	1	250	1_analyses_assessment_assessments_behaviors	[analyses, assessment, assessments, behaviors,...	[FUNCTIONAL ANALYSIS OF ABERRANT BEHAVIOR MAIN...
3	2	214	2_monitoring_behavioral_design_activity	[monitoring, behavioral, design, activity, phy...	[A MOBILE SELF-CONTROL TRAINING APP TO IMPROVE...
4	3	195	3_research_studies_study_interventions	[research, studies, study, interventions, desi...	[A MOTHER'S USE OF READING FLUENCY STRATEGIES:...
5	4	191	4_interventions_studies_study_research	[interventions, studies, study, research, effe...	[EFFECTS OF AN IPAD-BASED SPEECH-GENERATING DE...
6	5	170	5_interventions_effectiveness_study_behaviors	[interventions, effectiveness, study, behavior...	[COMPARISON OF LIVE MODELING AND VIDEO MODELIN...
7	6	157	6_analyses_assessing_software_research	[analyses, assessing, software, research, visu...	[ANALYSIS OF RISK OF BIAS ASSESSMENTS IN A SAM...
8	7	132	7_intervention_design_rehabilitation_therapy	[intervention, design, rehabilitation, therapy...	[FORCED USE OF THE UPPER EXTREMITY IN CEREBRAL...
9	8	114	8_disabilities_effectiveness_study_disability	[disabilities, effectiveness, study, disabilit...	[COMPARING THE EFFECTIVENESS OF TWO VIDEO FADI...
10	9	112	9_interventions_intervention_disabilities_skills	[interventions, intervention, disabilities, sk...	[VIDEO MODELING AND EXPLICIT INSTRUCTION: A CO...
11	10	98	10_experiences_methodology_research_study	[experiences, methodology, research, study, ex...	[LEADING CHANGE: A MULTIPLE-CASE STUDY OF LEAD...
12	11	80	11_interventions_behavioral_behaviors_behavior	[interventions, behavioral, behaviors, behavio...	[THE EFFECTS OF STUDENT- AND TEACHER-LED TOOTL...
13	12	61	12_rehabilitation_study_procedures_studies	[rehabilitation, study, procedures, studies, p...	[USING A DIGITAL SPELLING AID TO IMPROVE WRITI...
14	13	59	13_behavioral_methodology_development_research	[behavioral, methodology, development, researc...	[TRANSLATING BEHAVIOR ANALYSIS: A SPECTRUM RAT...
15	14	58	14_autism_interventions_intervention_behaviors	[autism, interventions, intervention, behavior...	[EXPANDING COMMUNICATION MODALITIES AND FUNCTI...
16	15	56	15_analyses_study_instructional_effectiveness	[analyses, study, instructional, effectiveness...	[EVALUATING THE USE OF VIDEO MODELING WITH VOI...
17	16	54	16_study_impairments_impairment_disabilities	[study, impairments, impairment, disabilities,...	[ENABLING PEOPLE WITH INTELLECTUAL AND SENSORY...
18	17	53	17_usability_effectiveness_efficacy_impairments	[usability, effectiveness, efficacy, impairmen...	[TEXT MESSAGES REDUCE MEMORY FAILURES IN ADULT...
19	18	51	18_psychology_stimulus_tests_treatments	[psychology, stimulus, tests, treatments, test...	[TEACHING STATISTICAL VARIABILITY WITH EQUIVAL...
20	19	45	19_stimulation_efficacy_questionnaire_visual	[stimulation, efficacy, questionnaire, visual,...	[EVALUATION OF THE OPTIMIZED PITCH AND LANGUAG...
21	20	42	20_analyses_interventions_behavioral_skills	[analyses, interventions, behavioral, skills, ...	[BEHAVIORAL INTERVENTIONS TO IMPROVE PERFORMAN...

	Topic	Count	Name	Representation	Representative_Docs
0	0	476	0_aba_parent_children_parents	[aba, parent, children, parents, telehealth, t...	[EVALUATION OF A TELEHEALTH ABA PROGRAM FOR CA...
1	1	305	1_reinforcement_behavior_response_functional	[reinforcement, behavior, response, functional...	[FUNCTIONAL ANALYSIS OF ABERRANT BEHAVIOR MAIN...
2	2	308	2_treatment_health_intervention_therapy	[treatment, health, intervention, therapy, stu...	[A MOBILE SELF-CONTROL TRAINING APP TO IMPROVE...
3	3	211	3_reading_students_words_writing	[reading, students, words, writing, vocabulary...	[A MOTHER'S USE OF READING FLUENCY STRATEGIES:...
4	4	245	4_aac_communication_speech_children	[aac, communication, speech, children, asd, au...	[EFFECTS OF AN IPAD-BASED SPEECH-GENERATING DE...
5	5	283	5_video_modeling_video modeling_asd	[video, modeling, video modeling, asd, skills,...	[COMPARISON OF LIVE MODELING AND VIDEO MODELIN...
6	6	198	6_data_case_single_single case	[data, case, single, single case, analysis, de...	[ANALYSIS OF RISK OF BIAS ASSESSMENTS IN A SAM...
7	7	192	7_motor_subject_single_design	[motor, subject, single, design, rehabilitatio...	[FORCED USE OF THE UPPER EXTREMITY IN CEREBRAL...
8	8	207	8_intellectual_disabilities_students_video	[intellectual, disabilities, students, video, ...	[COMPARING THE EFFECTIVENESS OF TWO VIDEO FADI...
9	9	132	9_students_mathematics_solving_virtual	[students, mathematics, solving, virtual, lear...	[VIDEO MODELING AND EXPLICIT INSTRUCTION: A CO...
10	10	141	10_design_learning_research_education	[design, learning, research, education, study,...	[LEADING CHANGE: A MULTIPLE-CASE STUDY OF LEAD...
11	11	151	11_behavior_classroom_game_disruptive	[behavior, classroom, game, disruptive, studen...	[THE EFFECTS OF STUDENT- AND TEACHER-LED TOOTL...
12	12	82	12_treatment_production_language_speech	[treatment, production, language, speech, ther...	[USING A DIGITAL SPELLING AID TO IMPROVE WRITI...
13	13	89	13_behavior_behavior analysis_analysis_aba	[behavior, behavior analysis, analysis, aba, b...	[TRANSLATING BEHAVIOR ANALYSIS: A SPECTRUM RAT...
14	14	124	14_social_peer_peers_asd	[social, peer, peers, asd, children, skills, i...	[EXPANDING COMMUNICATION MODALITIES AND FUNCTI...
15	15	152	15_training_staff_feedback_skills	[training, staff, feedback, skills, video, ski...	[EVALUATING THE USE OF VIDEO MODELING WITH VOI...
16	16	80	16_technology_participants_disabilities_intell...	[technology, participants, disabilities, intel...	[ENABLING PEOPLE WITH INTELLECTUAL AND SENSORY...
17	17	67	17_memory_brain_brain injury_injury	[memory, brain, brain injury, injury, cognitiv...	[TEXT MESSAGES REDUCE MEMORY FAILURES IN ADULT...
18	18	77	18_relations_instruction_stimulus_students	[relations, instruction, stimulus, students, c...	[TEACHING STATISTICAL VARIABILITY WITH EQUIVAL...
19	19	51	19_hearing_speech_recognition_patients	[hearing, speech, recognition, patients, subje...	[EVALUATION OF THE OPTIMIZED PITCH AND LANGUAG...
20	20	66	20_performance_matching_feedback_behavioral	[performance, matching, feedback, behavioral, ...	[BEHAVIORAL INTERVENTIONS TO IMPROVE PERFORMAN...

	Topic	Count	Name	Representation	Representative_Docs	GenName
0	1	500	0_study_intervention_treatment_design	[study, intervention, treatment, design, singl...	[EFFECTIVENESS OF TRAUMA-FOCUSED ART THERAPY (...	Single-Case Health Intervention Study
1	2	490	1_video_skills_modeling_video modeling	[video, skills, modeling, video modeling, inte...	[EFFECTS OF VIDEO MODELING WITH VIDEO FEEDBACK...	Video Modeling for Social Skills in Children w...
2	3	476	2_aba_children_parent_training	[aba, children, parent, training, parents, tel...	[TELEHEALTH AND AUTISM: TREATING CHALLENGING B...	Telehealth Interventions for Children with Autism
3	4	394	3_behavior_reinforcement_response_analysis	[behavior, reinforcement, response, analysis, ...	[THE EFFECT OF RULES ON DIFFERENTIAL REINFORCE...	Behavioral Analysis and Intervention Strategies
4	5	339	4_data_design_case_single	[data, design, case, single, analysis, single ...	[WHEN THE TRUTH HITS YOU BETWEEN THE EYES A SO...	Single-Case Visual Data Analysis
5	6	245	5_aac_communication_speech_children	[aac, communication, speech, children, asd, au...	[TRANSITIONING FROM A LOW- TO HIGH-TECH AAC (A...	Augmentative Communication for Children with A...
6	7	218	6_training_feedback_staff_performance	[training, feedback, staff, performance, skill...	[THE USE OF BEHAVIORAL SKILLS TRAINING TO TEAC...	Staff Training and Performance Enhancement
7	8	211	7_reading_students_words_instruction	[reading, students, words, instruction, vocabu...	[TECHNOLOGY-ASSISTED READING FLUENCY INTERVENT...	Reading Instruction and Vocabulary Development
8	9	151	8_behavior_classroom_game_disruptive	[behavior, classroom, game, disruptive, studen...	[EVALUATION OF THE GOOD BEHAVIOR GAME USING CL...	Classroom Behavior and Management Strategies
9	10	133	9_treatment_hearing_speech_study	[treatment, hearing, speech, study, single, re...	[USING TREATMENT TO IMPROVE THE PRODUCTION OF ...	Speech and Hearing Therapy Research
10	11	132	10_students_learning_solving_virtual	[students, learning, solving, virtual, instruc...	[USING A VIRTUAL NUMBER LINE AND CORRECTIVE FE...	Instructional Strategies for Students with Dis...
11	12	124	11_social_peer_asd_peers	[social, peer, asd, peers, children, skills, i...	[INTERACTION AMONG PRESCHOOLERS WITH AND WITHO...	Social Interaction Skills for Children with ASD
12	13	80	12_technology_participants_disabilities_multiple	[technology, participants, disabilities, multi...	[A SMARTPHONE-BASED PROGRAM ENABLING PEOPLE WI...	Smartphone Technology for People with Intellec...
13	14	77	13_instruction_relations_stimulus_students	[instruction, relations, stimulus, students, c...	[USING STIMULUS EQUIVALENCE-BASED INSTRUCTION ...	Programmed Instruction for College Students
14	15	67	14_brain_injury_cognitive_rehabilitation	[brain, injury, cognitive, rehabilitation, int...	[EFFICACY OF ELECTRONIC PORTABLE ASSISTIVE DEV...	Cognitive Rehabilitation for Brain Injury

	Topic	Count	Name	Representation	Representative_Docs	GenName	CustomLabel
0	1	500	0_study_intervention_treatment_design	['study', 'intervention', 'treatment', 'design...	['EFFECTIVENESS OF TRAUMA-FOCUSED ART THERAPY ...	Single-Case Health Intervention Study	Health Interventions
1	2	490	1_video_skills_modeling_video modeling	['video', 'skills', 'modeling', 'video modelin...	["EFFECTS OF VIDEO MODELING WITH VIDEO FEEDBAC...	Video Modeling for Social Skills in Children w...	Video modeling
2	3	476	2_aba_children_parent_training	['aba', 'children', 'parent', 'training', 'par...	['TELEHEALTH AND AUTISM: TREATING CHALLENGING ...	Telehealth Interventions for Children with Autism	Telehealth services
3	4	394	3_behavior_reinforcement_response_analysis	['behavior', 'reinforcement', 'response', 'ana...	['THE EFFECT OF RULES ON DIFFERENTIAL REINFORC...	Behavioral Analysis and Intervention Strategies	Function-based interventions
4	5	339	4_data_design_case_single	['data', 'design', 'case', 'single', 'analysis...	['WHEN THE TRUTH HITS YOU BETWEEN THE EYES A S...	Visual Analysis in Single-Case Research	Single-case design analysis
5	6	245	5_aac_communication_speech_children	['aac', 'communication', 'speech', 'children',...	["TRANSITIONING FROM A LOW- TO HIGH-TECH AAC (...	Augmentative Communication for Children with A...	AAC interventions
6	7	218	6_training_feedback_staff_performance	['training', 'feedback', 'staff', 'performance...	["THE USE OF BEHAVIORAL SKILLS TRAINING TO TEA...	Staff Training and Performance Enhancement	Staff training
7	8	211	7_reading_students_words_instruction	['reading', 'students', 'words', 'instruction'...	['TECHNOLOGY-ASSISTED READING FLUENCY INTERVEN...	Reading Instruction and Vocabulary Development	Reading and writing instruction
8	9	151	8_behavior_classroom_game_disruptive	['behavior', 'classroom', 'game', 'disruptive'...	["EVALUATION OF THE GOOD BEHAVIOR GAME USING C...	Classroom Behavior and Management Strategies	Classroom management
9	10	133	9_treatment_hearing_speech_study	['treatment', 'hearing', 'speech', 'study', 's...	['USING TREATMENT TO IMPROVE THE PRODUCTION OF...	Speech and Hearing Therapy Research	Speech and language therapy
10	11	132	10_students_learning_solving_virtual	['students', 'learning', 'solving', 'virtual',...	['USING A VIRTUAL NUMBER LINE AND CORRECTIVE F...	Instructional Strategies for Students with Dis...	Mathematics instruction
11	12	124	11_social_peer_asd_peers	['social', 'peer', 'asd', 'peers', 'children',...	["INTERACTION AMONG PRESCHOOLERS WITH AND WITH...	Social Interaction Skills for Children with ASD	Social skills interventions
12	13	80	12_technology_participants_disabilities_multiple	['technology', 'participants', 'disabilities',...	["A SMARTPHONE-BASED PROGRAM ENABLING PEOPLE W...	Smartphone Use by People with Intellectual Dis...	Assistive technology
13	14	77	13_instruction_relations_stimulus_students	['instruction', 'relations', 'stimulus', 'stud...	['USING STIMULUS EQUIVALENCE-BASED INSTRUCTION...	Programmed Instruction for College Students	Computer-based instruction
14	15	67	14_brain_injury_cognitive_rehabilitation	['brain', 'injury', 'cognitive', 'rehabilitati...	['EFFICACY OF ELECTRONIC PORTABLE ASSISTIVE DE...	Cognitive Rehabilitation for Brain Injury	Cognitive rehabilitation

Topic	CustomLabel	Count	Word (c-TF-IDF)
1	Health Interventions	500	study (0.028), intervention (0.027), treatment (0.026), design (0.026), single (0.025), therapy (0.025), participants (0.023), baseline (0.022), case (0.021), health (0.019)
2	Video modeling	490	video (0.063), skills (0.045), modeling (0.041), video modeling (0.035), intellectual (0.030), asd (0.029), children (0.028), autism (0.028), students (0.027), social (0.026)
3	Telehealth services	476	aba (0.053), children (0.034), parent (0.033), training (0.032), parents (0.032), telehealth (0.031), asd (0.028), intervention (0.027), autism (0.027), behavior (0.023)
4	Function-based interventions	394	behavior (0.077), reinforcement (0.077), response (0.038), analysis (0.038), functional (0.033), functional analysis (0.031), automatic (0.029), treatment (0.029), problem (0.027), problem behavior (0.027)
5	Single-case design analysis	339	data (0.049), design (0.048), case (0.045), single (0.040), analysis (0.038), single case (0.037), research (0.034), learning (0.026), designs (0.025), visual (0.024)
6	AAC interventions	245	aac (0.073), communication (0.070), speech (0.045), children (0.037), asd (0.033), autism (0.027), intervention (0.027), picture (0.024), participants (0.022), studies (0.022)
7	Staff training	218	training (0.080), feedback (0.053), staff (0.046), performance (0.042), skills (0.035), behavioral (0.032), video (0.032), skills training (0.028), participants (0.025), teaching (0.021)
8	Reading and writing instruction	211	reading (0.100), students (0.053), words (0.037), instruction (0.032), vocabulary (0.032), word (0.032), intervention (0.028), learning (0.027), fluency (0.025), study (0.024)
9	Classroom management	151	behavior (0.085), classroom (0.063), game (0.054), disruptive (0.052), students (0.048), good (0.041), student (0.036), group (0.036), management (0.035), teachers (0.035)
10	Speech and language therapy	133	treatment (0.062), hearing (0.051), speech (0.049), study (0.028), single (0.022), results (0.022), language (0.022), therapy (0.022), design (0.022), subject (0.022)
11	Mathematics instruction	132	students (0.092), learning (0.039), solving (0.039), virtual (0.038), instruction (0.038), disabilities (0.032), problems (0.032), intervention (0.028), problem solving (0.026), study (0.026)
12	Social skills interventions	124	social (0.091), peer (0.061), asd (0.049), peers (0.047), children (0.045), skills (0.035), interactions (0.033), intervention (0.031), interaction (0.029), communication (0.028)
13	Assistive technology	80	technology (0.058), participants (0.055), disabilities (0.048), multiple (0.040), intellectual (0.039), people (0.038), study (0.033), activity (0.031), smartphone (0.029), persons (0.028)
14	Computer-based instruction	77	instruction (0.077), relations (0.077), stimulus (0.053), students (0.040), college (0.037), programmed (0.035), participants (0.034), based (0.033), classes (0.033), based instruction (0.033)
15	Cognitive rehabilitation	67	brain (0.068), injury (0.063), cognitive (0.039), rehabilitation (0.036), intervention (0.032), participants (0.030), single (0.027), case (0.026), study (0.025), single case (0.025)

Semantic-Based Topic Modeling¶

Setting up the computing environment¶

Combine text columns¶

Preprocess documents¶

Conduct topic modeling¶

Outlier Reduction¶

Hierarchical Clustering¶

Identify representative words and scores for each topic¶

Visualize the semantic embedding space of documents by topic¶

Calculate cosine similarity among topics¶

Ngrams¶