Semantic-Based Topic Modeling¶

In [1]:
from datetime import datetime
date = datetime.now()
formatted_date = date.strftime("%B %d, %Y")
print(formatted_date)
March 04, 2026
In [2]:
# Check GPU memory
!nvidia-smi
Wed Mar  4 04:10:26 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.82.07              Driver Version: 580.82.07      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   32C    P0             44W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+
In [3]:
# Check system RAM
!free -h
               total        used        free      shared  buff/cache   available
Mem:            83Gi       1.3Gi        78Gi       2.0Mi       3.7Gi        81Gi
Swap:             0B          0B          0B

Setting up the computing environment¶

In [4]:
from google.colab import drive
drive.mount('/content/drive')

from google.colab import userdata
userdata.get('HF_TOKEN')

# Set up the current working directory within the Google Drive
%cd /content/drive/My\ Drive/Colab\ Notebooks/LLM/sped_biblio/topic_modeling
Mounted at /content/drive
/content/drive/My Drive/Colab Notebooks/LLM/sped_biblio/topic_modeling
In [6]:
!pip install --upgrade -q "numpy>=2.0" "pandas<2.3" openpyxl sentence-transformers bertopic scikit-learn matplotlib umap-learn hdbscan python-dotenv openai
In [8]:
import os
import warnings
warnings.filterwarnings('ignore', category=SyntaxWarning, module='hdbscan', message="invalid escape sequence '\\{'")

import re
from collections import defaultdict
import pickle
from pickle import UnpicklingError
from pickle import PicklingError
import itertools

# Data Manipulation
import numpy as np
import pandas as pd
import requests

# Natural Language Processing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

# Generating Topic Labels
import openai
from dotenv import load_dotenv

# Clustering
from hdbscan import HDBSCAN
from umap import UMAP
from scipy.cluster import hierarchy as sch

# Visualization Imports
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import plotly.colors as pc
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from matplotlib.ticker import FuncFormatter
import colorlover as cl
import textwrap

# Progress Bar
from tqdm import tqdm

# Display HTML
from IPython.display import IFrame
In [9]:
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

def generate_topic_labels(topic_info, topic_model, openai_model="gpt-4o", max_tokens=10, temperature=0.3):

    gen_names = []

    for topic_id in topic_info['Topic']:
        if topic_id == -1:
            gen_names.append("Outlier")
            continue

        keywords = topic_model.get_topic(topic_id)
        if keywords:
            top_keywords = ", ".join([keyword[0] for keyword in keywords[:10]])
            prompt = f"""
You are a highly skilled data scientist specializing in generating concise and descriptive topic labels based on provided top terms for each topic.
Each topic consists of a list of terms ordered from most to least significant.

Your objective is to create precise and concise labels that capture the essence of each topic by following these guidelines:

1. Use Person-First Language:
   - Prioritize respectful and inclusive language.
   - Avoid terms that may be considered offensive or stigmatizing.
   - For example, use "students with learning disabilities" instead of "disabled students".

2. Analyze the significance of the top terms:
   - Focus primarily on the most significant terms.
   - Include additional terms if they add essential context.

3. Synthesize the Topic Label:
   - Ensure clarity and conciseness (aim for 4-5 words).
   - Reflect the collective meaning of the most influential terms.
   - Use descriptive yet precise phrasing.

4. Maintain consistency:
   - Capitalize the first word using title case.
   - Use uniform formatting and avoid ambiguity.
   - Make consie and complete expressions.

Example
----------
Top 10 Keywords in [Representation]:
virtual manipulatives, manipulatives, mathematical, app, solving, learning disability, algebra, area, tool, concrete manipulatives

Generated Topic Label in [GenName]:
Visual-based technology for mathematical problem solving

Top 10 Keywords: {top_keywords}
Generated Topic Label in [GenName]:
"""
            response = openai.chat.completions.create(
                model=openai_model,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=temperature
            )
            gen_name = response.choices[0].message.content.strip()
            gen_name = re.sub(r'Generated Topic Label in \[GenName\]:', '', gen_name, flags=re.IGNORECASE).strip()
            gen_names.append(gen_name)
        else:
            gen_names.append("No Keywords")

    topic_info['GenName'] = gen_names
    return topic_info

Combine text columns¶

In [10]:
all_data_file = f"files/all_data.xlsx"
all_data = pd.read_excel(all_data_file, na_filter=False)

df = all_data[all_data['filtered'] == 'Yes'].reset_index(drop=True)
df['Year'] = df['PY'].astype(int)
df['Decade'] = (df['Year'] // 10) * 10
df['CR'] = df['CR'].astype(str)

Preprocess documents¶

In [11]:
import logging
logging.getLogger("sentence_transformers").setLevel(logging.ERROR)
logging.getLogger("transformers").setLevel(logging.ERROR)

sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

logging.getLogger("sentence_transformers").setLevel(logging.WARNING)
logging.getLogger("transformers").setLevel(logging.WARNING)
modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]
config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]
README.md: 0.00B [00:00, ?B/s]
sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]
config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]
model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]
Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]
tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]
vocab.txt: 0.00B [00:00, ?B/s]
tokenizer.json: 0.00B [00:00, ?B/s]
special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]
config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]
In [12]:
publication_embeddings = sentence_model.encode(df['combined_text'].tolist(), show_progress_bar=True)
publication_embeddings_df = pd.DataFrame(publication_embeddings)
publication_embeddings_df['UT'] = df['UT'].tolist()
Batches:   0%|          | 0/114 [00:00<?, ?it/s]

Conduct topic modeling¶

In [13]:
umap_model = UMAP(
    n_neighbors=20,
    n_components=2,
    min_dist=0.00,
    metric='cosine',
    random_state=42
)

reduced_embeddings = umap_model.fit_transform(publication_embeddings_df.drop(columns='UT'))
reduced_embeddings_df = pd.DataFrame(reduced_embeddings, columns=["x", "y"])
reduced_embeddings_df['UT'] = publication_embeddings_df['UT'].tolist()

hdbscan_model = HDBSCAN(
    min_cluster_size=35,
    min_samples=25,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True
)

hdbscan_model.fit(reduced_embeddings)
labels = hdbscan_model.labels_

vectorizer_model =  CountVectorizer(min_df=10)
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
representation_model = KeyBERTInspired()

topic_model = BERTopic(
  embedding_model=sentence_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  ctfidf_model=ctfidf_model,
  representation_model=representation_model,
  calculate_probabilities=True,
  verbose=True
)

topics, probs = topic_model.fit_transform(df['combined_text'])
2026-03-04 04:15:42,451 - BERTopic - Embedding - Transforming documents to embeddings.
Batches:   0%|          | 0/114 [00:00<?, ?it/s]
2026-03-04 04:15:45,827 - BERTopic - Embedding - Completed ✓
2026-03-04 04:15:45,828 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-03-04 04:16:10,753 - BERTopic - Dimensionality - Completed ✓
2026-03-04 04:16:10,754 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-03-04 04:16:11,031 - BERTopic - Cluster - Completed ✓
2026-03-04 04:16:11,035 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-03-04 04:16:12,160 - BERTopic - Representation - Completed ✓
In [14]:
topic_model.get_topic_info()
Out[14]:
Topic Count Name Representation Representative_Docs
0 -1 1054 -1_interventions_assessment_behavioral_behaviors [interventions, assessment, behavioral, behavi... [ONLINE INDIRECT GROUP TREATMENT FOR PRESCHOOL...
1 0 391 0_autism_research_interventions_studies [autism, research, interventions, studies, the... [EVALUATION OF A TELEHEALTH ABA PROGRAM FOR CA...
2 1 250 1_analyses_assessment_assessments_behaviors [analyses, assessment, assessments, behaviors,... [FUNCTIONAL ANALYSIS OF ABERRANT BEHAVIOR MAIN...
3 2 214 2_monitoring_behavioral_design_activity [monitoring, behavioral, design, activity, phy... [A MOBILE SELF-CONTROL TRAINING APP TO IMPROVE...
4 3 195 3_research_studies_study_interventions [research, studies, study, interventions, desi... [A MOTHER'S USE OF READING FLUENCY STRATEGIES:...
5 4 191 4_interventions_studies_study_research [interventions, studies, study, research, effe... [EFFECTS OF AN IPAD-BASED SPEECH-GENERATING DE...
6 5 170 5_interventions_effectiveness_study_behaviors [interventions, effectiveness, study, behavior... [COMPARISON OF LIVE MODELING AND VIDEO MODELIN...
7 6 157 6_analyses_assessing_software_research [analyses, assessing, software, research, visu... [ANALYSIS OF RISK OF BIAS ASSESSMENTS IN A SAM...
8 7 132 7_intervention_design_rehabilitation_therapy [intervention, design, rehabilitation, therapy... [FORCED USE OF THE UPPER EXTREMITY IN CEREBRAL...
9 8 114 8_disabilities_effectiveness_study_disability [disabilities, effectiveness, study, disabilit... [COMPARING THE EFFECTIVENESS OF TWO VIDEO FADI...
10 9 112 9_interventions_intervention_disabilities_skills [interventions, intervention, disabilities, sk... [VIDEO MODELING AND EXPLICIT INSTRUCTION: A CO...
11 10 98 10_experiences_methodology_research_study [experiences, methodology, research, study, ex... [LEADING CHANGE: A MULTIPLE-CASE STUDY OF LEAD...
12 11 80 11_interventions_behavioral_behaviors_behavior [interventions, behavioral, behaviors, behavio... [THE EFFECTS OF STUDENT- AND TEACHER-LED TOOTL...
13 12 61 12_rehabilitation_study_procedures_studies [rehabilitation, study, procedures, studies, p... [USING A DIGITAL SPELLING AID TO IMPROVE WRITI...
14 13 59 13_behavioral_methodology_development_research [behavioral, methodology, development, researc... [TRANSLATING BEHAVIOR ANALYSIS: A SPECTRUM RAT...
15 14 58 14_autism_interventions_intervention_behaviors [autism, interventions, intervention, behavior... [EXPANDING COMMUNICATION MODALITIES AND FUNCTI...
16 15 56 15_analyses_study_instructional_effectiveness [analyses, study, instructional, effectiveness... [EVALUATING THE USE OF VIDEO MODELING WITH VOI...
17 16 54 16_study_impairments_impairment_disabilities [study, impairments, impairment, disabilities,... [ENABLING PEOPLE WITH INTELLECTUAL AND SENSORY...
18 17 53 17_usability_effectiveness_efficacy_impairments [usability, effectiveness, efficacy, impairmen... [TEXT MESSAGES REDUCE MEMORY FAILURES IN ADULT...
19 18 51 18_psychology_stimulus_tests_treatments [psychology, stimulus, tests, treatments, test... [TEACHING STATISTICAL VARIABILITY WITH EQUIVAL...
20 19 45 19_stimulation_efficacy_questionnaire_visual [stimulation, efficacy, questionnaire, visual,... [EVALUATION OF THE OPTIMIZED PITCH AND LANGUAG...
21 20 42 20_analyses_interventions_behavioral_skills [analyses, interventions, behavioral, skills, ... [BEHAVIORAL INTERVENTIONS TO IMPROVE PERFORMAN...

Outlier Reduction¶

In [15]:
df["combined_text"] = (
    df["combined_text"]
    .str.replace(
        r"(?i)\baugmentative and alternative communication\b",
        "AAC",
        regex=True
    )
    .str.replace(
        r"(?i)\bapplied behavior analysis\b",
        "ABA",
        regex=True
    )
    .str.replace(
        r"(?i)\bautism spectrum disorder\b",
        "ASD",
        regex=True
    )
)

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
custom_stop_words = list(ENGLISH_STOP_WORDS - {"what"})

vectorizer_model = CountVectorizer(stop_words=custom_stop_words, ngram_range=(1, 3), min_df=10)
topic_model.update_topics(df['combined_text'], vectorizer_model=vectorizer_model)
In [16]:
new_topics = topic_model.reduce_outliers(df['combined_text'], topics, strategy="c-tf-idf")
In [17]:
import logging
logging.getLogger("BERTopic").setLevel(logging.ERROR)
topic_model.update_topics(df['combined_text'], topics=new_topics, vectorizer_model=vectorizer_model)
logging.getLogger("BERTopic").setLevel(logging.WARNING)
In [18]:
topic_model.get_topic_info()
Out[18]:
Topic Count Name Representation Representative_Docs
0 0 476 0_aba_parent_children_parents [aba, parent, children, parents, telehealth, t... [EVALUATION OF A TELEHEALTH ABA PROGRAM FOR CA...
1 1 305 1_reinforcement_behavior_response_functional [reinforcement, behavior, response, functional... [FUNCTIONAL ANALYSIS OF ABERRANT BEHAVIOR MAIN...
2 2 308 2_treatment_health_intervention_therapy [treatment, health, intervention, therapy, stu... [A MOBILE SELF-CONTROL TRAINING APP TO IMPROVE...
3 3 211 3_reading_students_words_writing [reading, students, words, writing, vocabulary... [A MOTHER'S USE OF READING FLUENCY STRATEGIES:...
4 4 245 4_aac_communication_speech_children [aac, communication, speech, children, asd, au... [EFFECTS OF AN IPAD-BASED SPEECH-GENERATING DE...
5 5 283 5_video_modeling_video modeling_asd [video, modeling, video modeling, asd, skills,... [COMPARISON OF LIVE MODELING AND VIDEO MODELIN...
6 6 198 6_data_case_single_single case [data, case, single, single case, analysis, de... [ANALYSIS OF RISK OF BIAS ASSESSMENTS IN A SAM...
7 7 192 7_motor_subject_single_design [motor, subject, single, design, rehabilitatio... [FORCED USE OF THE UPPER EXTREMITY IN CEREBRAL...
8 8 207 8_intellectual_disabilities_students_video [intellectual, disabilities, students, video, ... [COMPARING THE EFFECTIVENESS OF TWO VIDEO FADI...
9 9 132 9_students_mathematics_solving_virtual [students, mathematics, solving, virtual, lear... [VIDEO MODELING AND EXPLICIT INSTRUCTION: A CO...
10 10 141 10_design_learning_research_education [design, learning, research, education, study,... [LEADING CHANGE: A MULTIPLE-CASE STUDY OF LEAD...
11 11 151 11_behavior_classroom_game_disruptive [behavior, classroom, game, disruptive, studen... [THE EFFECTS OF STUDENT- AND TEACHER-LED TOOTL...
12 12 82 12_treatment_production_language_speech [treatment, production, language, speech, ther... [USING A DIGITAL SPELLING AID TO IMPROVE WRITI...
13 13 89 13_behavior_behavior analysis_analysis_aba [behavior, behavior analysis, analysis, aba, b... [TRANSLATING BEHAVIOR ANALYSIS: A SPECTRUM RAT...
14 14 124 14_social_peer_peers_asd [social, peer, peers, asd, children, skills, i... [EXPANDING COMMUNICATION MODALITIES AND FUNCTI...
15 15 152 15_training_staff_feedback_skills [training, staff, feedback, skills, video, ski... [EVALUATING THE USE OF VIDEO MODELING WITH VOI...
16 16 80 16_technology_participants_disabilities_intell... [technology, participants, disabilities, intel... [ENABLING PEOPLE WITH INTELLECTUAL AND SENSORY...
17 17 67 17_memory_brain_brain injury_injury [memory, brain, brain injury, injury, cognitiv... [TEXT MESSAGES REDUCE MEMORY FAILURES IN ADULT...
18 18 77 18_relations_instruction_stimulus_students [relations, instruction, stimulus, students, c... [TEACHING STATISTICAL VARIABILITY WITH EQUIVAL...
19 19 51 19_hearing_speech_recognition_patients [hearing, speech, recognition, patients, subje... [EVALUATION OF THE OPTIMIZED PITCH AND LANGUAG...
20 20 66 20_performance_matching_feedback_behavioral [performance, matching, feedback, behavioral, ... [BEHAVIORAL INTERVENTIONS TO IMPROVE PERFORMAN...

Hierarchical Clustering¶

In [19]:
hierarchical_topics_init = topic_model.hierarchical_topics(df['combined_text'])

fig_hierarchical_init = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics_init)

fig_hierarchical_init.update_layout(
    width=1000,
    height=800,
    margin=dict(t=60, b=60, l=120, r=60),
    plot_bgcolor='white',
    paper_bgcolor='white',
    font_size=14,
    title="Hierarchical Topic Clustering"
)

fig_hierarchical_init.update_layout(width=None, height=None, autosize=True)
fig_hierarchical_init.write_html(
    "results/fig_hierarchical_init.html",
    config={"responsive": True}
)

# fig_hierarchical_init.show()
100%|██████████| 20/20 [00:00<00:00, 284.56it/s]
In [20]:
IFrame(src='results/fig_hierarchical_init.html', width=1000, height=800)
Out[20]:
In [21]:
topics_to_merge = [[2, 7],
                  [6, 10],
                  [1, 13],
                  [5, 8],
                  [12, 19],
                  [15, 20]]
topic_model.merge_topics(df['combined_text'], topics_to_merge)
In [22]:
hierarchical_topics = topic_model.hierarchical_topics(df['combined_text'])

fig_hierarchical = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

fig_hierarchical.update_layout(
    width=1000,
    height=800,
    margin=dict(t=60, b=60, l=120, r=60),
    plot_bgcolor='white',
    paper_bgcolor='white',
    font_size=14,
    title="Hierarchical Topic Clustering"
)

fig_hierarchical.update_layout(width=None, height=None, autosize=True)
fig_hierarchical.write_html(
    "results/fig_hierarchical.html",
    config={"responsive": True}
)

# fig_hierarchical.show()
100%|██████████| 14/14 [00:00<00:00, 286.07it/s]
In [23]:
IFrame(src='results/fig_hierarchical.html', width=1000, height=800)
Out[23]:
In [24]:
document_info = topic_model.get_document_info(df["combined_text"])
df["Topic"] = document_info["Topic"] + 1

topic_info = topic_model.get_topic_info()

topic_info_gen = generate_topic_labels(topic_info, topic_model)

topic_model.set_topic_labels(topic_info_gen['GenName'].tolist())

for i in range(probs.shape[1]):
    df[f'Prob_Topic_{i}'] = probs[:, i]

data = []
for topic in topic_info['Topic']:
    words_with_scores = topic_model.get_topic(topic)
    for word, score in words_with_scores:
        data.append({"Topic": topic, "Word": word, "Score": score})

topic_model_words = pd.DataFrame(data)

topic_info_words = topic_info.merge(topic_model_words, how="left", on="Topic") \
                                                      .sort_values(by=['Topic', 'Score'], ascending=[True, False])

topic_info['Topic'] = topic_info['Topic'] + 1
topic_info_words['Topic'] = topic_info_words['Topic'] + 1
In [25]:
topic_info_gen
Out[25]:
Topic Count Name Representation Representative_Docs GenName
0 1 500 0_study_intervention_treatment_design [study, intervention, treatment, design, singl... [EFFECTIVENESS OF TRAUMA-FOCUSED ART THERAPY (... Single-Case Health Intervention Study
1 2 490 1_video_skills_modeling_video modeling [video, skills, modeling, video modeling, inte... [EFFECTS OF VIDEO MODELING WITH VIDEO FEEDBACK... Video Modeling for Social Skills in Children w...
2 3 476 2_aba_children_parent_training [aba, children, parent, training, parents, tel... [TELEHEALTH AND AUTISM: TREATING CHALLENGING B... Telehealth Interventions for Children with Autism
3 4 394 3_behavior_reinforcement_response_analysis [behavior, reinforcement, response, analysis, ... [THE EFFECT OF RULES ON DIFFERENTIAL REINFORCE... Behavioral Analysis and Intervention Strategies
4 5 339 4_data_design_case_single [data, design, case, single, analysis, single ... [WHEN THE TRUTH HITS YOU BETWEEN THE EYES A SO... Single-Case Visual Data Analysis
5 6 245 5_aac_communication_speech_children [aac, communication, speech, children, asd, au... [TRANSITIONING FROM A LOW- TO HIGH-TECH AAC (A... Augmentative Communication for Children with A...
6 7 218 6_training_feedback_staff_performance [training, feedback, staff, performance, skill... [THE USE OF BEHAVIORAL SKILLS TRAINING TO TEAC... Staff Training and Performance Enhancement
7 8 211 7_reading_students_words_instruction [reading, students, words, instruction, vocabu... [TECHNOLOGY-ASSISTED READING FLUENCY INTERVENT... Reading Instruction and Vocabulary Development
8 9 151 8_behavior_classroom_game_disruptive [behavior, classroom, game, disruptive, studen... [EVALUATION OF THE GOOD BEHAVIOR GAME USING CL... Classroom Behavior and Management Strategies
9 10 133 9_treatment_hearing_speech_study [treatment, hearing, speech, study, single, re... [USING TREATMENT TO IMPROVE THE PRODUCTION OF ... Speech and Hearing Therapy Research
10 11 132 10_students_learning_solving_virtual [students, learning, solving, virtual, instruc... [USING A VIRTUAL NUMBER LINE AND CORRECTIVE FE... Instructional Strategies for Students with Dis...
11 12 124 11_social_peer_asd_peers [social, peer, asd, peers, children, skills, i... [INTERACTION AMONG PRESCHOOLERS WITH AND WITHO... Social Interaction Skills for Children with ASD
12 13 80 12_technology_participants_disabilities_multiple [technology, participants, disabilities, multi... [A SMARTPHONE-BASED PROGRAM ENABLING PEOPLE WI... Smartphone Technology for People with Intellec...
13 14 77 13_instruction_relations_stimulus_students [instruction, relations, stimulus, students, c... [USING STIMULUS EQUIVALENCE-BASED INSTRUCTION ... Programmed Instruction for College Students
14 15 67 14_brain_injury_cognitive_rehabilitation [brain, injury, cognitive, rehabilitation, int... [EFFICACY OF ELECTRONIC PORTABLE ASSISTIVE DEV... Cognitive Rehabilitation for Brain Injury
In [26]:
topic_info.to_excel("files/topic_info.xlsx", engine='openpyxl', index=False)
topic_info_words.to_excel("files/topic_info_words.xlsx", engine='openpyxl', index=False)
df.to_excel("files/df.xlsx", engine='openpyxl', index=False)
In [27]:
# topic_info = pd.read_excel("files/topic_info.xlsx")
# topic_info_words = pd.read_excel("files/topic_info_words.xlsx")
# df = pd.read_excel("files/df.xlsx")
topic_info_human_loop = pd.read_excel("files/topic_info_human_loop.xlsx")
In [28]:
topic_info_human_loop
Out[28]:
Topic Count Name Representation Representative_Docs GenName CustomLabel
0 1 500 0_study_intervention_treatment_design ['study', 'intervention', 'treatment', 'design... ['EFFECTIVENESS OF TRAUMA-FOCUSED ART THERAPY ... Single-Case Health Intervention Study Health Interventions
1 2 490 1_video_skills_modeling_video modeling ['video', 'skills', 'modeling', 'video modelin... ["EFFECTS OF VIDEO MODELING WITH VIDEO FEEDBAC... Video Modeling for Social Skills in Children w... Video modeling
2 3 476 2_aba_children_parent_training ['aba', 'children', 'parent', 'training', 'par... ['TELEHEALTH AND AUTISM: TREATING CHALLENGING ... Telehealth Interventions for Children with Autism Telehealth services
3 4 394 3_behavior_reinforcement_response_analysis ['behavior', 'reinforcement', 'response', 'ana... ['THE EFFECT OF RULES ON DIFFERENTIAL REINFORC... Behavioral Analysis and Intervention Strategies Function-based interventions
4 5 339 4_data_design_case_single ['data', 'design', 'case', 'single', 'analysis... ['WHEN THE TRUTH HITS YOU BETWEEN THE EYES A S... Visual Analysis in Single-Case Research Single-case design analysis
5 6 245 5_aac_communication_speech_children ['aac', 'communication', 'speech', 'children',... ["TRANSITIONING FROM A LOW- TO HIGH-TECH AAC (... Augmentative Communication for Children with A... AAC interventions
6 7 218 6_training_feedback_staff_performance ['training', 'feedback', 'staff', 'performance... ["THE USE OF BEHAVIORAL SKILLS TRAINING TO TEA... Staff Training and Performance Enhancement Staff training
7 8 211 7_reading_students_words_instruction ['reading', 'students', 'words', 'instruction'... ['TECHNOLOGY-ASSISTED READING FLUENCY INTERVEN... Reading Instruction and Vocabulary Development Reading and writing instruction
8 9 151 8_behavior_classroom_game_disruptive ['behavior', 'classroom', 'game', 'disruptive'... ["EVALUATION OF THE GOOD BEHAVIOR GAME USING C... Classroom Behavior and Management Strategies Classroom management
9 10 133 9_treatment_hearing_speech_study ['treatment', 'hearing', 'speech', 'study', 's... ['USING TREATMENT TO IMPROVE THE PRODUCTION OF... Speech and Hearing Therapy Research Speech and language therapy
10 11 132 10_students_learning_solving_virtual ['students', 'learning', 'solving', 'virtual',... ['USING A VIRTUAL NUMBER LINE AND CORRECTIVE F... Instructional Strategies for Students with Dis... Mathematics instruction
11 12 124 11_social_peer_asd_peers ['social', 'peer', 'asd', 'peers', 'children',... ["INTERACTION AMONG PRESCHOOLERS WITH AND WITH... Social Interaction Skills for Children with ASD Social skills interventions
12 13 80 12_technology_participants_disabilities_multiple ['technology', 'participants', 'disabilities',... ["A SMARTPHONE-BASED PROGRAM ENABLING PEOPLE W... Smartphone Use by People with Intellectual Dis... Assistive technology
13 14 77 13_instruction_relations_stimulus_students ['instruction', 'relations', 'stimulus', 'stud... ['USING STIMULUS EQUIVALENCE-BASED INSTRUCTION... Programmed Instruction for College Students Computer-based instruction
14 15 67 14_brain_injury_cognitive_rehabilitation ['brain', 'injury', 'cognitive', 'rehabilitati... ['EFFICACY OF ELECTRONIC PORTABLE ASSISTIVE DE... Cognitive Rehabilitation for Brain Injury Cognitive rehabilitation
In [29]:
#topic_info = topic_info.drop(columns=['Topic'])
#topic_info_concat = pd.merge(topic_info, topic_info_human_loop[['Name', 'Topic', 'CustomLabel']], how="left", on="Name")
topic_info_concat = pd.merge(topic_info, topic_info_human_loop[['Name', 'CustomLabel']], how="left", on="Name")
order = ['Topic', 'Count', 'Name', 'Representation', 'GenName', 'CustomLabel']
topic_info_concat = topic_info_concat.reindex(columns=order)

#topic_info_words = topic_info_words.drop(columns=['Topic'])
#topic_info_words_concat = pd.merge(topic_info_words, topic_info_human_loop[['Name', 'Topic', 'CustomLabel']], how="left", on="Name")
topic_info_words_concat = pd.merge(topic_info_words, topic_info_human_loop[['Name', 'CustomLabel']], how="left", on="Name")
order = ['Topic', 'Count', 'Name', 'Representation', 'Word', 'Score', 'Representative_Docs', 'GenName', 'CustomLabel']
topic_info_words_concat = topic_info_words_concat.reindex(columns=order)
In [30]:
files = {
    "files/topic_model.pkl": topic_model,
    "files/publication_embeddings.pkl": publication_embeddings,
    "files/publication_embeddings_df.pkl": publication_embeddings_df,
    "files/reduced_embeddings.pkl": reduced_embeddings,
    "files/reduced_embeddings_df.pkl": reduced_embeddings_df,
    "files/topic_info.pkl": topic_info,
    "files/topic_info_concat.pkl": topic_info_concat,
    "files/topic_info_words.pkl": topic_info_words,
    "files/topic_info_words_concat.pkl": topic_info_words_concat,
    "files/df.pkl": df
}

for filename, data in files.items():
    if data is None:
        continue

    try:
        with open(filename, "wb") as f:
            pickle.dump(data, f)
    except Exception as e:
        print(f"Error saving pickle for {filename}: {e}")

    try:
        excel_filename = filename.rsplit('.', 1)[0] + ".xlsx"
        if isinstance(data, pd.DataFrame):
            data.to_excel(excel_filename, index=False)
        elif isinstance(data, np.ndarray) and data.ndim in [1, 2]:
            pd.DataFrame(data).to_excel(excel_filename, index=False)
    except Exception as e:
        print(f"Error saving Excel for {filename}: {e}")
In [ ]:
#def load_data_from_files(files_dict):
#   loaded_data = {}
#   for filename, var_name in files_dict.items():
#       filepath = os.path.join('files', filename)
#       if os.path.exists(filepath) and os.path.getsize(filepath) > 0:
#           try:
#               with open(filepath, "rb") as f:
#                   loaded_data[var_name] = pickle.load(f)
#           except Exception as e:
#               loaded_data[var_name] = None
#               print(f"Error loading {filepath}: {e}")
#       else:
#           loaded_data[var_name] = None
#   return loaded_data

#files_to_load = {
#   "topic_model.pkl": "topic_model",
#   "publication_embeddings.pkl": "publication_embeddings",
#   "publication_embeddings_df.pkl": "publication_embeddings_df",
#   "reduced_embeddings.pkl": "reduced_embeddings",
#   "reduced_embeddings_df.pkl": "reduced_embeddings_df",
#   "topic_info.pkl": "topic_info",
#   "topic_info_concat.pkl": "topic_info_concat",
#   "topic_info_words.pkl": "topic_info_words",
#   "topic_info_words_concat.pkl": "topic_info_words_concat",
#   "df.pkl": "df"
#}

#loaded_data = load_data_from_files(files_to_load)
#topic_model = loaded_data.get("topic_model")
#publication_embeddings = loaded_data.get("publication_embeddings")
#publication_embeddings_df = loaded_data.get("publication_embeddings_df")
#reduced_embeddings = loaded_data.get("reduced_embeddings")
#reduced_embeddings_df = loaded_data.get("reduced_embeddings_df")
#topic_info = loaded_data.get("topic_info")
#topic_info_concat = loaded_data.get("topic_info_concat")
#topic_info_words = loaded_data.get("topic_info_words")
#topic_info_words_concat = loaded_data.get("topic_info_words_concat")
#df = loaded_data.get("df")

Identify representative words and scores for each topic¶

In [31]:
topic_word_data = topic_info_words_concat.copy()

topic_word_data['Word (c-TF-IDF)'] = topic_word_data.apply(
    lambda row: f"{row['Word']} ({row['Score']:.3f})", axis=1
)

topic_word_data = topic_word_data.sort_values(by=['CustomLabel', 'Score'], ascending=[True, False])

topic_word_table = topic_word_data.groupby('CustomLabel', sort=False).agg(
    Topic=('Topic', 'first'),
    Count=('Count', 'first'),
    **{'Word (c-TF-IDF)': ('Word (c-TF-IDF)', lambda x: ', '.join(x))}
).reset_index()[['Topic', 'CustomLabel', 'Count', 'Word (c-TF-IDF)']]

topic_word_table = topic_word_table.sort_values(by='Topic')

styles = [
    dict(selector="caption", props=[("caption-side", "top")]),
    {'selector': 'th', 'props': [('text-align', 'center')]},
    {'selector': 'td', 'props': [('text-align', 'left')]}
]

topic_word_table = (
    topic_word_table.style
    .set_caption("<b>Topic Word Score</b>")
    .set_table_styles(styles)
    .set_properties(**{
        'font-size': '24px',
        'font-family': 'Helvetica Neue',
        'color': 'black',
        'text-align': 'center'
    })
    .set_properties(**{
        'text-align': 'center',
        'font-size': '14px',
        'font-family': 'Helvetica Neue',
        'color': 'black',
    }, subset=pd.IndexSlice[:, topic_word_table.columns])
    .hide(axis='index')
)
In [32]:
topic_word_table
Out[32]:
Topic Word Score
Topic CustomLabel Count Word (c-TF-IDF)
1 Health Interventions 500 study (0.028), intervention (0.027), treatment (0.026), design (0.026), single (0.025), therapy (0.025), participants (0.023), baseline (0.022), case (0.021), health (0.019)
2 Video modeling 490 video (0.063), skills (0.045), modeling (0.041), video modeling (0.035), intellectual (0.030), asd (0.029), children (0.028), autism (0.028), students (0.027), social (0.026)
3 Telehealth services 476 aba (0.053), children (0.034), parent (0.033), training (0.032), parents (0.032), telehealth (0.031), asd (0.028), intervention (0.027), autism (0.027), behavior (0.023)
4 Function-based interventions 394 behavior (0.077), reinforcement (0.077), response (0.038), analysis (0.038), functional (0.033), functional analysis (0.031), automatic (0.029), treatment (0.029), problem (0.027), problem behavior (0.027)
5 Single-case design analysis 339 data (0.049), design (0.048), case (0.045), single (0.040), analysis (0.038), single case (0.037), research (0.034), learning (0.026), designs (0.025), visual (0.024)
6 AAC interventions 245 aac (0.073), communication (0.070), speech (0.045), children (0.037), asd (0.033), autism (0.027), intervention (0.027), picture (0.024), participants (0.022), studies (0.022)
7 Staff training 218 training (0.080), feedback (0.053), staff (0.046), performance (0.042), skills (0.035), behavioral (0.032), video (0.032), skills training (0.028), participants (0.025), teaching (0.021)
8 Reading and writing instruction 211 reading (0.100), students (0.053), words (0.037), instruction (0.032), vocabulary (0.032), word (0.032), intervention (0.028), learning (0.027), fluency (0.025), study (0.024)
9 Classroom management 151 behavior (0.085), classroom (0.063), game (0.054), disruptive (0.052), students (0.048), good (0.041), student (0.036), group (0.036), management (0.035), teachers (0.035)
10 Speech and language therapy 133 treatment (0.062), hearing (0.051), speech (0.049), study (0.028), single (0.022), results (0.022), language (0.022), therapy (0.022), design (0.022), subject (0.022)
11 Mathematics instruction 132 students (0.092), learning (0.039), solving (0.039), virtual (0.038), instruction (0.038), disabilities (0.032), problems (0.032), intervention (0.028), problem solving (0.026), study (0.026)
12 Social skills interventions 124 social (0.091), peer (0.061), asd (0.049), peers (0.047), children (0.045), skills (0.035), interactions (0.033), intervention (0.031), interaction (0.029), communication (0.028)
13 Assistive technology 80 technology (0.058), participants (0.055), disabilities (0.048), multiple (0.040), intellectual (0.039), people (0.038), study (0.033), activity (0.031), smartphone (0.029), persons (0.028)
14 Computer-based instruction 77 instruction (0.077), relations (0.077), stimulus (0.053), students (0.040), college (0.037), programmed (0.035), participants (0.034), based (0.033), classes (0.033), based instruction (0.033)
15 Cognitive rehabilitation 67 brain (0.068), injury (0.063), cognitive (0.039), rehabilitation (0.036), intervention (0.032), participants (0.030), single (0.027), case (0.026), study (0.025), single case (0.025)

Visualize the semantic embedding space of documents by topic¶

In [33]:
color_map = [
    '#EF5350',
    '#42A5F5',
    '#66BB6A',
    '#FFA726',
    '#AB47BC',
    '#26A69A',
    '#FFEE58',
    '#FF8A65',
    '#EC407A',
    '#90A4AE',
    '#C0CA33',
    '#4DD0E1',
    '#A1887F',
    '#CE93D8',
    '#7986CB'
]

docs_topics_data = pd.merge(df, topic_info_concat[['Topic', 'CustomLabel']], on='Topic', how='left')
docs_topics_data['Topic'] = pd.to_numeric(docs_topics_data['Topic'], errors='coerce')
docs_topics_data = pd.merge(docs_topics_data, reduced_embeddings_df[['UT', 'x', 'y']], on='UT', how='left')
docs_topics_data['x'] = pd.to_numeric(docs_topics_data['x'], errors='coerce')
docs_topics_data['y'] = pd.to_numeric(docs_topics_data['y'], errors='coerce')
docs_topics_data = docs_topics_data.dropna(subset=['Topic', 'x', 'y'])
#docs_topics_data = df.copy()
docs_topics_data['Topic'] = docs_topics_data['Topic'].astype(int)

ordered_legend = docs_topics_data.sort_values('Topic')['CustomLabel'].unique().tolist()
ordered_legend = [x for x in ordered_legend if pd.notnull(x)]
docs_topics_data['legend_topic'] = pd.Categorical(docs_topics_data['CustomLabel'],
                                                    categories=ordered_legend,
                                                    ordered=True)
In [34]:
docs_topics_data['hover_text'] = (
    "Title: " +
    docs_topics_data['TI'].apply(lambda text: "<br>".join(textwrap.wrap(text, width=50))) +
    "<br><br>Topic " + docs_topics_data['Topic'].astype(str) + ": " + docs_topics_data['CustomLabel']
)

color_mapping = {cat: color_map[i % len(color_map)] for i, cat in enumerate(ordered_legend)}

fig_cluster = px.scatter(
    docs_topics_data,
    x='x',
    y='y',
    color='legend_topic',
    color_discrete_map=color_mapping,
    labels={'x': 'X', 'y': 'Y'},
    custom_data=['hover_text'],
    category_orders={'legend_topic': ordered_legend}
)

fig_cluster.update_traces(
    marker=dict(size=10, opacity=0.9, line=dict(width=1, color="white")),
    selector=dict(mode='markers'),
    hovertemplate='<b>%{customdata[0]}</b><extra></extra>'
)

fig_cluster.update_traces(
    hoverlabel=dict(font_size=16)
)

fig_cluster.update_layout(
    title="<b>Semantic Space of Documents by Topic</b>",
    title_x=0.5,
    title_font=dict(size=24, family="Helvetica Neue", color="black"),
    margin=dict(t=80, b=80, l=80, r=80),
    plot_bgcolor='white',
    paper_bgcolor='white',
    width=900,
    height=800,
    showlegend=True,
    legend=dict(
        orientation="h",
        yanchor="top",
        y=-0.15,
        xanchor="center",
        x=0.5,
        title_text="",
        font=dict(size=14),
        title_font=dict(size=16)
    )
)

fig_cluster.update_layout(width=None, height=None, autosize=True)
fig_cluster.write_html(
    "results/fig_cluster.html",
    config={"responsive": True}
)
# fig_cluster.show()
In [35]:
IFrame(src='results/fig_cluster.html', width=900, height=1000)
Out[35]:

Calculate cosine similarity among topics¶

In [36]:
merged_df = pd.merge(df, publication_embeddings_df, on='UT')

embeddings = [col for col in publication_embeddings_df.columns if col != 'UT']
topic_embeddings = merged_df.groupby("Topic")[embeddings].mean()

cosine_sim_matrix = cosine_similarity(topic_embeddings)

cosine_sim_df = pd.DataFrame(
    cosine_sim_matrix,
    index=topic_embeddings.index,
    columns=topic_embeddings.index
)

topic_to_label = topic_info_concat.set_index('Topic')['CustomLabel'].to_dict()

hover_text = []
for i, row_topic in enumerate(cosine_sim_df.index):
    row_hover = []
    for j, col_topic in enumerate(cosine_sim_df.columns):
        row_label = topic_to_label.get(row_topic, f"Topic {row_topic}")
        col_label = topic_to_label.get(col_topic, f"Topic {col_topic}")
        similarity = cosine_sim_df.iloc[i, j]
        hover = f'Topic {row_topic}: {row_label}<br>Topic {col_topic}: {col_label}<br>Cosine Similarity: {similarity:.3f}'
        row_hover.append(hover)
    hover_text.append(row_hover)

fig_cosine_sim = px.imshow(
    cosine_sim_df,
    color_continuous_scale="YlGnBu",
    origin="lower",
    labels=dict(color="Similarity"),
    x=cosine_sim_df.columns,
    y=cosine_sim_df.index,
    text_auto=True
)

fig_cosine_sim.update_xaxes(
    tickmode='array',
    tickvals=cosine_sim_df.columns,
    ticktext=[str(topic) for topic in cosine_sim_df.columns],
    tickfont=dict(size=13)
)

fig_cosine_sim.update_yaxes(
    tickmode='array',
    tickvals=cosine_sim_df.index,
    ticktext=[str(topic) for topic in cosine_sim_df.index],
    tickfont=dict(size=13)
)

fig_cosine_sim.update_traces(
    texttemplate='%{z:.2f}',
    textfont=dict(size=13),
    customdata=hover_text,
    hovertemplate='%{customdata}<extra></extra>',
    hoverlabel=dict(font_size=13)
)

fig_cosine_sim.update_layout(
    title="<b>Cosine Similarity Among Topics</b>",
    title_x=0.5,
    title_font=dict(size=24, family="Helvetica Neue", color="black"),
    margin=dict(t=80, b=80, l=80, r=80),
    plot_bgcolor='white',
    paper_bgcolor='white',
    width=900,
    height=900,
    showlegend=True,
    legend_title_text="Topic",
    legend_title_font=dict(size=13),
    legend_font=dict(size=13),
    font=dict(size=13)
)

x_dom = fig_cosine_sim.layout.xaxis.domain
x_center = (x_dom[0] + x_dom[1]) / 2
x_len = 1 * (x_dom[1] - x_dom[0])

fig_cosine_sim.update_layout(
    coloraxis_colorbar=dict(
        orientation='h',
        x=x_center,
        xanchor='center',
        y=-0.13,
        yanchor='top',
        len=x_len,
        thickness=12
    )
)

fig_cosine_sim.update_layout(width=None, height=None, autosize=True)
fig_cosine_sim.write_html(
    "results/fig_cosine_sim.html",
    config={"responsive": True}
)
# fig_cosine_sim.show()
In [37]:
IFrame(src='results/fig_cosine_sim.html', width=900, height=900)
Out[37]:

Ngrams¶

In [38]:
vectorizer = vectorizer_model.build_analyzer()

df = docs_topics_data.copy()

def extract_ngrams(text):
    all_ngrams = vectorizer(text)
    unigrams = [ng for ng in all_ngrams if len(ng.split()) == 1]
    bigrams  = [ng for ng in all_ngrams if len(ng.split()) == 2]
    trigrams = [ng for ng in all_ngrams if len(ng.split()) == 3]
    return unigrams, bigrams, trigrams

df['unigrams'], df['bigrams'], df['trigrams'] = zip(*df['combined_text'].apply(extract_ngrams))
In [39]:
with open("files/df.pkl", "wb") as f_df:
    pickle.dump(df, f_df)

df.to_excel("files/df.xlsx", engine='openpyxl', index=False)
topic_word_table.to_excel("files/topic_word_table.xlsx", engine='openpyxl', index=False)
cosine_sim_df.to_excel("files/cosine_sim_df.xlsx", engine='openpyxl', index=False)
In [ ]:
# with open("files/df.pkl", "rb") as f_df:
#     df = pickle.load(f_df)

# df = pd.read_excel("files/df.xlsx")
# topic_word_table = pd.read_excel("files/topic_word_table.xlsx")
# cosine_sim_df = pd.read_excel("files/cosine_sim_df.xlsx")
In [41]:
from nbconvert import HTMLExporter
import nbformat

notebook_path = 'index.ipynb'
html_exporter = HTMLExporter()

with open(notebook_path, 'r', encoding='utf-8') as nb_file:
    notebook_content = nb_file.read()
    notebook = nbformat.reads(notebook_content, as_version=4)

if 'widgets' in notebook.metadata and 'application/vnd.jupyter.widget-state+json' in notebook.metadata['widgets']:
    if 'state' not in notebook.metadata['widgets']['application/vnd.jupyter.widget-state+json']:
        notebook.metadata['widgets']['application/vnd.jupyter.widget-state+json']['state'] = {}

html_output, _ = html_exporter.from_notebook_node(notebook)

with open('index.html', 'w', encoding='utf-8') as html_file:
    html_file.write(html_output)