Semantic-Based Topic Modeling¶
In [1]:
from datetime import datetime
date = datetime.now()
formatted_date = date.strftime("%B %d, %Y")
print(formatted_date)
March 04, 2026
In [2]:
# Check GPU memory
!nvidia-smi
Wed Mar 4 04:10:26 2026 +-----------------------------------------------------------------------------------------+ | NVIDIA-SMI 580.82.07 Driver Version: 580.82.07 CUDA Version: 13.0 | +-----------------------------------------+------------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA A100-SXM4-40GB Off | 00000000:00:04.0 Off | 0 | | N/A 32C P0 44W / 400W | 0MiB / 40960MiB | 0% Default | | | | Disabled | +-----------------------------------------+------------------------+----------------------+ +-----------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=========================================================================================| | No running processes found | +-----------------------------------------------------------------------------------------+
In [3]:
# Check system RAM
!free -h
total used free shared buff/cache available Mem: 83Gi 1.3Gi 78Gi 2.0Mi 3.7Gi 81Gi Swap: 0B 0B 0B
Setting up the computing environment¶
In [4]:
from google.colab import drive
drive.mount('/content/drive')
from google.colab import userdata
userdata.get('HF_TOKEN')
# Set up the current working directory within the Google Drive
%cd /content/drive/My\ Drive/Colab\ Notebooks/LLM/sped_biblio/topic_modeling
Mounted at /content/drive /content/drive/My Drive/Colab Notebooks/LLM/sped_biblio/topic_modeling
In [6]:
!pip install --upgrade -q "numpy>=2.0" "pandas<2.3" openpyxl sentence-transformers bertopic scikit-learn matplotlib umap-learn hdbscan python-dotenv openai
In [8]:
import os
import warnings
warnings.filterwarnings('ignore', category=SyntaxWarning, module='hdbscan', message="invalid escape sequence '\\{'")
import re
from collections import defaultdict
import pickle
from pickle import UnpicklingError
from pickle import PicklingError
import itertools
# Data Manipulation
import numpy as np
import pandas as pd
import requests
# Natural Language Processing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
# Generating Topic Labels
import openai
from dotenv import load_dotenv
# Clustering
from hdbscan import HDBSCAN
from umap import UMAP
from scipy.cluster import hierarchy as sch
# Visualization Imports
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import plotly.colors as pc
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from matplotlib.ticker import FuncFormatter
import colorlover as cl
import textwrap
# Progress Bar
from tqdm import tqdm
# Display HTML
from IPython.display import IFrame
In [9]:
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
def generate_topic_labels(topic_info, topic_model, openai_model="gpt-4o", max_tokens=10, temperature=0.3):
gen_names = []
for topic_id in topic_info['Topic']:
if topic_id == -1:
gen_names.append("Outlier")
continue
keywords = topic_model.get_topic(topic_id)
if keywords:
top_keywords = ", ".join([keyword[0] for keyword in keywords[:10]])
prompt = f"""
You are a highly skilled data scientist specializing in generating concise and descriptive topic labels based on provided top terms for each topic.
Each topic consists of a list of terms ordered from most to least significant.
Your objective is to create precise and concise labels that capture the essence of each topic by following these guidelines:
1. Use Person-First Language:
- Prioritize respectful and inclusive language.
- Avoid terms that may be considered offensive or stigmatizing.
- For example, use "students with learning disabilities" instead of "disabled students".
2. Analyze the significance of the top terms:
- Focus primarily on the most significant terms.
- Include additional terms if they add essential context.
3. Synthesize the Topic Label:
- Ensure clarity and conciseness (aim for 4-5 words).
- Reflect the collective meaning of the most influential terms.
- Use descriptive yet precise phrasing.
4. Maintain consistency:
- Capitalize the first word using title case.
- Use uniform formatting and avoid ambiguity.
- Make consie and complete expressions.
Example
----------
Top 10 Keywords in [Representation]:
virtual manipulatives, manipulatives, mathematical, app, solving, learning disability, algebra, area, tool, concrete manipulatives
Generated Topic Label in [GenName]:
Visual-based technology for mathematical problem solving
Top 10 Keywords: {top_keywords}
Generated Topic Label in [GenName]:
"""
response = openai.chat.completions.create(
model=openai_model,
messages=[{"role": "user", "content": prompt}],
max_tokens=max_tokens,
temperature=temperature
)
gen_name = response.choices[0].message.content.strip()
gen_name = re.sub(r'Generated Topic Label in \[GenName\]:', '', gen_name, flags=re.IGNORECASE).strip()
gen_names.append(gen_name)
else:
gen_names.append("No Keywords")
topic_info['GenName'] = gen_names
return topic_info
Combine text columns¶
In [10]:
all_data_file = f"files/all_data.xlsx"
all_data = pd.read_excel(all_data_file, na_filter=False)
df = all_data[all_data['filtered'] == 'Yes'].reset_index(drop=True)
df['Year'] = df['PY'].astype(int)
df['Decade'] = (df['Year'] // 10) * 10
df['CR'] = df['CR'].astype(str)
Preprocess documents¶
In [11]:
import logging
logging.getLogger("sentence_transformers").setLevel(logging.ERROR)
logging.getLogger("transformers").setLevel(logging.ERROR)
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
logging.getLogger("sentence_transformers").setLevel(logging.WARNING)
logging.getLogger("transformers").setLevel(logging.WARNING)
modules.json: 0%| | 0.00/349 [00:00<?, ?B/s]
config_sentence_transformers.json: 0%| | 0.00/116 [00:00<?, ?B/s]
README.md: 0.00B [00:00, ?B/s]
sentence_bert_config.json: 0%| | 0.00/53.0 [00:00<?, ?B/s]
config.json: 0%| | 0.00/612 [00:00<?, ?B/s]
model.safetensors: 0%| | 0.00/90.9M [00:00<?, ?B/s]
Loading weights: 0%| | 0/103 [00:00<?, ?it/s]
tokenizer_config.json: 0%| | 0.00/350 [00:00<?, ?B/s]
vocab.txt: 0.00B [00:00, ?B/s]
tokenizer.json: 0.00B [00:00, ?B/s]
special_tokens_map.json: 0%| | 0.00/112 [00:00<?, ?B/s]
config.json: 0%| | 0.00/190 [00:00<?, ?B/s]
In [12]:
publication_embeddings = sentence_model.encode(df['combined_text'].tolist(), show_progress_bar=True)
publication_embeddings_df = pd.DataFrame(publication_embeddings)
publication_embeddings_df['UT'] = df['UT'].tolist()
Batches: 0%| | 0/114 [00:00<?, ?it/s]
Conduct topic modeling¶
In [13]:
umap_model = UMAP(
n_neighbors=20,
n_components=2,
min_dist=0.00,
metric='cosine',
random_state=42
)
reduced_embeddings = umap_model.fit_transform(publication_embeddings_df.drop(columns='UT'))
reduced_embeddings_df = pd.DataFrame(reduced_embeddings, columns=["x", "y"])
reduced_embeddings_df['UT'] = publication_embeddings_df['UT'].tolist()
hdbscan_model = HDBSCAN(
min_cluster_size=35,
min_samples=25,
metric='euclidean',
cluster_selection_method='eom',
prediction_data=True
)
hdbscan_model.fit(reduced_embeddings)
labels = hdbscan_model.labels_
vectorizer_model = CountVectorizer(min_df=10)
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
representation_model = KeyBERTInspired()
topic_model = BERTopic(
embedding_model=sentence_model,
umap_model=umap_model,
hdbscan_model=hdbscan_model,
vectorizer_model=vectorizer_model,
ctfidf_model=ctfidf_model,
representation_model=representation_model,
calculate_probabilities=True,
verbose=True
)
topics, probs = topic_model.fit_transform(df['combined_text'])
2026-03-04 04:15:42,451 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 0%| | 0/114 [00:00<?, ?it/s]
2026-03-04 04:15:45,827 - BERTopic - Embedding - Completed ✓ 2026-03-04 04:15:45,828 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm 2026-03-04 04:16:10,753 - BERTopic - Dimensionality - Completed ✓ 2026-03-04 04:16:10,754 - BERTopic - Cluster - Start clustering the reduced embeddings 2026-03-04 04:16:11,031 - BERTopic - Cluster - Completed ✓ 2026-03-04 04:16:11,035 - BERTopic - Representation - Fine-tuning topics using representation models. 2026-03-04 04:16:12,160 - BERTopic - Representation - Completed ✓
In [14]:
topic_model.get_topic_info()
Out[14]:
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 1054 | -1_interventions_assessment_behavioral_behaviors | [interventions, assessment, behavioral, behavi... | [ONLINE INDIRECT GROUP TREATMENT FOR PRESCHOOL... |
| 1 | 0 | 391 | 0_autism_research_interventions_studies | [autism, research, interventions, studies, the... | [EVALUATION OF A TELEHEALTH ABA PROGRAM FOR CA... |
| 2 | 1 | 250 | 1_analyses_assessment_assessments_behaviors | [analyses, assessment, assessments, behaviors,... | [FUNCTIONAL ANALYSIS OF ABERRANT BEHAVIOR MAIN... |
| 3 | 2 | 214 | 2_monitoring_behavioral_design_activity | [monitoring, behavioral, design, activity, phy... | [A MOBILE SELF-CONTROL TRAINING APP TO IMPROVE... |
| 4 | 3 | 195 | 3_research_studies_study_interventions | [research, studies, study, interventions, desi... | [A MOTHER'S USE OF READING FLUENCY STRATEGIES:... |
| 5 | 4 | 191 | 4_interventions_studies_study_research | [interventions, studies, study, research, effe... | [EFFECTS OF AN IPAD-BASED SPEECH-GENERATING DE... |
| 6 | 5 | 170 | 5_interventions_effectiveness_study_behaviors | [interventions, effectiveness, study, behavior... | [COMPARISON OF LIVE MODELING AND VIDEO MODELIN... |
| 7 | 6 | 157 | 6_analyses_assessing_software_research | [analyses, assessing, software, research, visu... | [ANALYSIS OF RISK OF BIAS ASSESSMENTS IN A SAM... |
| 8 | 7 | 132 | 7_intervention_design_rehabilitation_therapy | [intervention, design, rehabilitation, therapy... | [FORCED USE OF THE UPPER EXTREMITY IN CEREBRAL... |
| 9 | 8 | 114 | 8_disabilities_effectiveness_study_disability | [disabilities, effectiveness, study, disabilit... | [COMPARING THE EFFECTIVENESS OF TWO VIDEO FADI... |
| 10 | 9 | 112 | 9_interventions_intervention_disabilities_skills | [interventions, intervention, disabilities, sk... | [VIDEO MODELING AND EXPLICIT INSTRUCTION: A CO... |
| 11 | 10 | 98 | 10_experiences_methodology_research_study | [experiences, methodology, research, study, ex... | [LEADING CHANGE: A MULTIPLE-CASE STUDY OF LEAD... |
| 12 | 11 | 80 | 11_interventions_behavioral_behaviors_behavior | [interventions, behavioral, behaviors, behavio... | [THE EFFECTS OF STUDENT- AND TEACHER-LED TOOTL... |
| 13 | 12 | 61 | 12_rehabilitation_study_procedures_studies | [rehabilitation, study, procedures, studies, p... | [USING A DIGITAL SPELLING AID TO IMPROVE WRITI... |
| 14 | 13 | 59 | 13_behavioral_methodology_development_research | [behavioral, methodology, development, researc... | [TRANSLATING BEHAVIOR ANALYSIS: A SPECTRUM RAT... |
| 15 | 14 | 58 | 14_autism_interventions_intervention_behaviors | [autism, interventions, intervention, behavior... | [EXPANDING COMMUNICATION MODALITIES AND FUNCTI... |
| 16 | 15 | 56 | 15_analyses_study_instructional_effectiveness | [analyses, study, instructional, effectiveness... | [EVALUATING THE USE OF VIDEO MODELING WITH VOI... |
| 17 | 16 | 54 | 16_study_impairments_impairment_disabilities | [study, impairments, impairment, disabilities,... | [ENABLING PEOPLE WITH INTELLECTUAL AND SENSORY... |
| 18 | 17 | 53 | 17_usability_effectiveness_efficacy_impairments | [usability, effectiveness, efficacy, impairmen... | [TEXT MESSAGES REDUCE MEMORY FAILURES IN ADULT... |
| 19 | 18 | 51 | 18_psychology_stimulus_tests_treatments | [psychology, stimulus, tests, treatments, test... | [TEACHING STATISTICAL VARIABILITY WITH EQUIVAL... |
| 20 | 19 | 45 | 19_stimulation_efficacy_questionnaire_visual | [stimulation, efficacy, questionnaire, visual,... | [EVALUATION OF THE OPTIMIZED PITCH AND LANGUAG... |
| 21 | 20 | 42 | 20_analyses_interventions_behavioral_skills | [analyses, interventions, behavioral, skills, ... | [BEHAVIORAL INTERVENTIONS TO IMPROVE PERFORMAN... |
Outlier Reduction¶
In [15]:
df["combined_text"] = (
df["combined_text"]
.str.replace(
r"(?i)\baugmentative and alternative communication\b",
"AAC",
regex=True
)
.str.replace(
r"(?i)\bapplied behavior analysis\b",
"ABA",
regex=True
)
.str.replace(
r"(?i)\bautism spectrum disorder\b",
"ASD",
regex=True
)
)
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
custom_stop_words = list(ENGLISH_STOP_WORDS - {"what"})
vectorizer_model = CountVectorizer(stop_words=custom_stop_words, ngram_range=(1, 3), min_df=10)
topic_model.update_topics(df['combined_text'], vectorizer_model=vectorizer_model)
In [16]:
new_topics = topic_model.reduce_outliers(df['combined_text'], topics, strategy="c-tf-idf")
In [17]:
import logging
logging.getLogger("BERTopic").setLevel(logging.ERROR)
topic_model.update_topics(df['combined_text'], topics=new_topics, vectorizer_model=vectorizer_model)
logging.getLogger("BERTopic").setLevel(logging.WARNING)
In [18]:
topic_model.get_topic_info()
Out[18]:
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | 0 | 476 | 0_aba_parent_children_parents | [aba, parent, children, parents, telehealth, t... | [EVALUATION OF A TELEHEALTH ABA PROGRAM FOR CA... |
| 1 | 1 | 305 | 1_reinforcement_behavior_response_functional | [reinforcement, behavior, response, functional... | [FUNCTIONAL ANALYSIS OF ABERRANT BEHAVIOR MAIN... |
| 2 | 2 | 308 | 2_treatment_health_intervention_therapy | [treatment, health, intervention, therapy, stu... | [A MOBILE SELF-CONTROL TRAINING APP TO IMPROVE... |
| 3 | 3 | 211 | 3_reading_students_words_writing | [reading, students, words, writing, vocabulary... | [A MOTHER'S USE OF READING FLUENCY STRATEGIES:... |
| 4 | 4 | 245 | 4_aac_communication_speech_children | [aac, communication, speech, children, asd, au... | [EFFECTS OF AN IPAD-BASED SPEECH-GENERATING DE... |
| 5 | 5 | 283 | 5_video_modeling_video modeling_asd | [video, modeling, video modeling, asd, skills,... | [COMPARISON OF LIVE MODELING AND VIDEO MODELIN... |
| 6 | 6 | 198 | 6_data_case_single_single case | [data, case, single, single case, analysis, de... | [ANALYSIS OF RISK OF BIAS ASSESSMENTS IN A SAM... |
| 7 | 7 | 192 | 7_motor_subject_single_design | [motor, subject, single, design, rehabilitatio... | [FORCED USE OF THE UPPER EXTREMITY IN CEREBRAL... |
| 8 | 8 | 207 | 8_intellectual_disabilities_students_video | [intellectual, disabilities, students, video, ... | [COMPARING THE EFFECTIVENESS OF TWO VIDEO FADI... |
| 9 | 9 | 132 | 9_students_mathematics_solving_virtual | [students, mathematics, solving, virtual, lear... | [VIDEO MODELING AND EXPLICIT INSTRUCTION: A CO... |
| 10 | 10 | 141 | 10_design_learning_research_education | [design, learning, research, education, study,... | [LEADING CHANGE: A MULTIPLE-CASE STUDY OF LEAD... |
| 11 | 11 | 151 | 11_behavior_classroom_game_disruptive | [behavior, classroom, game, disruptive, studen... | [THE EFFECTS OF STUDENT- AND TEACHER-LED TOOTL... |
| 12 | 12 | 82 | 12_treatment_production_language_speech | [treatment, production, language, speech, ther... | [USING A DIGITAL SPELLING AID TO IMPROVE WRITI... |
| 13 | 13 | 89 | 13_behavior_behavior analysis_analysis_aba | [behavior, behavior analysis, analysis, aba, b... | [TRANSLATING BEHAVIOR ANALYSIS: A SPECTRUM RAT... |
| 14 | 14 | 124 | 14_social_peer_peers_asd | [social, peer, peers, asd, children, skills, i... | [EXPANDING COMMUNICATION MODALITIES AND FUNCTI... |
| 15 | 15 | 152 | 15_training_staff_feedback_skills | [training, staff, feedback, skills, video, ski... | [EVALUATING THE USE OF VIDEO MODELING WITH VOI... |
| 16 | 16 | 80 | 16_technology_participants_disabilities_intell... | [technology, participants, disabilities, intel... | [ENABLING PEOPLE WITH INTELLECTUAL AND SENSORY... |
| 17 | 17 | 67 | 17_memory_brain_brain injury_injury | [memory, brain, brain injury, injury, cognitiv... | [TEXT MESSAGES REDUCE MEMORY FAILURES IN ADULT... |
| 18 | 18 | 77 | 18_relations_instruction_stimulus_students | [relations, instruction, stimulus, students, c... | [TEACHING STATISTICAL VARIABILITY WITH EQUIVAL... |
| 19 | 19 | 51 | 19_hearing_speech_recognition_patients | [hearing, speech, recognition, patients, subje... | [EVALUATION OF THE OPTIMIZED PITCH AND LANGUAG... |
| 20 | 20 | 66 | 20_performance_matching_feedback_behavioral | [performance, matching, feedback, behavioral, ... | [BEHAVIORAL INTERVENTIONS TO IMPROVE PERFORMAN... |
Hierarchical Clustering¶
In [19]:
hierarchical_topics_init = topic_model.hierarchical_topics(df['combined_text'])
fig_hierarchical_init = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics_init)
fig_hierarchical_init.update_layout(
width=1000,
height=800,
margin=dict(t=60, b=60, l=120, r=60),
plot_bgcolor='white',
paper_bgcolor='white',
font_size=14,
title="Hierarchical Topic Clustering"
)
fig_hierarchical_init.update_layout(width=None, height=None, autosize=True)
fig_hierarchical_init.write_html(
"results/fig_hierarchical_init.html",
config={"responsive": True}
)
# fig_hierarchical_init.show()
100%|██████████| 20/20 [00:00<00:00, 284.56it/s]
In [20]:
IFrame(src='results/fig_hierarchical_init.html', width=1000, height=800)
Out[20]:
In [21]:
topics_to_merge = [[2, 7],
[6, 10],
[1, 13],
[5, 8],
[12, 19],
[15, 20]]
topic_model.merge_topics(df['combined_text'], topics_to_merge)
In [22]:
hierarchical_topics = topic_model.hierarchical_topics(df['combined_text'])
fig_hierarchical = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
fig_hierarchical.update_layout(
width=1000,
height=800,
margin=dict(t=60, b=60, l=120, r=60),
plot_bgcolor='white',
paper_bgcolor='white',
font_size=14,
title="Hierarchical Topic Clustering"
)
fig_hierarchical.update_layout(width=None, height=None, autosize=True)
fig_hierarchical.write_html(
"results/fig_hierarchical.html",
config={"responsive": True}
)
# fig_hierarchical.show()
100%|██████████| 14/14 [00:00<00:00, 286.07it/s]
In [23]:
IFrame(src='results/fig_hierarchical.html', width=1000, height=800)
Out[23]:
In [24]:
document_info = topic_model.get_document_info(df["combined_text"])
df["Topic"] = document_info["Topic"] + 1
topic_info = topic_model.get_topic_info()
topic_info_gen = generate_topic_labels(topic_info, topic_model)
topic_model.set_topic_labels(topic_info_gen['GenName'].tolist())
for i in range(probs.shape[1]):
df[f'Prob_Topic_{i}'] = probs[:, i]
data = []
for topic in topic_info['Topic']:
words_with_scores = topic_model.get_topic(topic)
for word, score in words_with_scores:
data.append({"Topic": topic, "Word": word, "Score": score})
topic_model_words = pd.DataFrame(data)
topic_info_words = topic_info.merge(topic_model_words, how="left", on="Topic") \
.sort_values(by=['Topic', 'Score'], ascending=[True, False])
topic_info['Topic'] = topic_info['Topic'] + 1
topic_info_words['Topic'] = topic_info_words['Topic'] + 1
In [25]:
topic_info_gen
Out[25]:
| Topic | Count | Name | Representation | Representative_Docs | GenName | |
|---|---|---|---|---|---|---|
| 0 | 1 | 500 | 0_study_intervention_treatment_design | [study, intervention, treatment, design, singl... | [EFFECTIVENESS OF TRAUMA-FOCUSED ART THERAPY (... | Single-Case Health Intervention Study |
| 1 | 2 | 490 | 1_video_skills_modeling_video modeling | [video, skills, modeling, video modeling, inte... | [EFFECTS OF VIDEO MODELING WITH VIDEO FEEDBACK... | Video Modeling for Social Skills in Children w... |
| 2 | 3 | 476 | 2_aba_children_parent_training | [aba, children, parent, training, parents, tel... | [TELEHEALTH AND AUTISM: TREATING CHALLENGING B... | Telehealth Interventions for Children with Autism |
| 3 | 4 | 394 | 3_behavior_reinforcement_response_analysis | [behavior, reinforcement, response, analysis, ... | [THE EFFECT OF RULES ON DIFFERENTIAL REINFORCE... | Behavioral Analysis and Intervention Strategies |
| 4 | 5 | 339 | 4_data_design_case_single | [data, design, case, single, analysis, single ... | [WHEN THE TRUTH HITS YOU BETWEEN THE EYES A SO... | Single-Case Visual Data Analysis |
| 5 | 6 | 245 | 5_aac_communication_speech_children | [aac, communication, speech, children, asd, au... | [TRANSITIONING FROM A LOW- TO HIGH-TECH AAC (A... | Augmentative Communication for Children with A... |
| 6 | 7 | 218 | 6_training_feedback_staff_performance | [training, feedback, staff, performance, skill... | [THE USE OF BEHAVIORAL SKILLS TRAINING TO TEAC... | Staff Training and Performance Enhancement |
| 7 | 8 | 211 | 7_reading_students_words_instruction | [reading, students, words, instruction, vocabu... | [TECHNOLOGY-ASSISTED READING FLUENCY INTERVENT... | Reading Instruction and Vocabulary Development |
| 8 | 9 | 151 | 8_behavior_classroom_game_disruptive | [behavior, classroom, game, disruptive, studen... | [EVALUATION OF THE GOOD BEHAVIOR GAME USING CL... | Classroom Behavior and Management Strategies |
| 9 | 10 | 133 | 9_treatment_hearing_speech_study | [treatment, hearing, speech, study, single, re... | [USING TREATMENT TO IMPROVE THE PRODUCTION OF ... | Speech and Hearing Therapy Research |
| 10 | 11 | 132 | 10_students_learning_solving_virtual | [students, learning, solving, virtual, instruc... | [USING A VIRTUAL NUMBER LINE AND CORRECTIVE FE... | Instructional Strategies for Students with Dis... |
| 11 | 12 | 124 | 11_social_peer_asd_peers | [social, peer, asd, peers, children, skills, i... | [INTERACTION AMONG PRESCHOOLERS WITH AND WITHO... | Social Interaction Skills for Children with ASD |
| 12 | 13 | 80 | 12_technology_participants_disabilities_multiple | [technology, participants, disabilities, multi... | [A SMARTPHONE-BASED PROGRAM ENABLING PEOPLE WI... | Smartphone Technology for People with Intellec... |
| 13 | 14 | 77 | 13_instruction_relations_stimulus_students | [instruction, relations, stimulus, students, c... | [USING STIMULUS EQUIVALENCE-BASED INSTRUCTION ... | Programmed Instruction for College Students |
| 14 | 15 | 67 | 14_brain_injury_cognitive_rehabilitation | [brain, injury, cognitive, rehabilitation, int... | [EFFICACY OF ELECTRONIC PORTABLE ASSISTIVE DEV... | Cognitive Rehabilitation for Brain Injury |
In [26]:
topic_info.to_excel("files/topic_info.xlsx", engine='openpyxl', index=False)
topic_info_words.to_excel("files/topic_info_words.xlsx", engine='openpyxl', index=False)
df.to_excel("files/df.xlsx", engine='openpyxl', index=False)
In [27]:
# topic_info = pd.read_excel("files/topic_info.xlsx")
# topic_info_words = pd.read_excel("files/topic_info_words.xlsx")
# df = pd.read_excel("files/df.xlsx")
topic_info_human_loop = pd.read_excel("files/topic_info_human_loop.xlsx")
In [28]:
topic_info_human_loop
Out[28]:
| Topic | Count | Name | Representation | Representative_Docs | GenName | CustomLabel | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 500 | 0_study_intervention_treatment_design | ['study', 'intervention', 'treatment', 'design... | ['EFFECTIVENESS OF TRAUMA-FOCUSED ART THERAPY ... | Single-Case Health Intervention Study | Health Interventions |
| 1 | 2 | 490 | 1_video_skills_modeling_video modeling | ['video', 'skills', 'modeling', 'video modelin... | ["EFFECTS OF VIDEO MODELING WITH VIDEO FEEDBAC... | Video Modeling for Social Skills in Children w... | Video modeling |
| 2 | 3 | 476 | 2_aba_children_parent_training | ['aba', 'children', 'parent', 'training', 'par... | ['TELEHEALTH AND AUTISM: TREATING CHALLENGING ... | Telehealth Interventions for Children with Autism | Telehealth services |
| 3 | 4 | 394 | 3_behavior_reinforcement_response_analysis | ['behavior', 'reinforcement', 'response', 'ana... | ['THE EFFECT OF RULES ON DIFFERENTIAL REINFORC... | Behavioral Analysis and Intervention Strategies | Function-based interventions |
| 4 | 5 | 339 | 4_data_design_case_single | ['data', 'design', 'case', 'single', 'analysis... | ['WHEN THE TRUTH HITS YOU BETWEEN THE EYES A S... | Visual Analysis in Single-Case Research | Single-case design analysis |
| 5 | 6 | 245 | 5_aac_communication_speech_children | ['aac', 'communication', 'speech', 'children',... | ["TRANSITIONING FROM A LOW- TO HIGH-TECH AAC (... | Augmentative Communication for Children with A... | AAC interventions |
| 6 | 7 | 218 | 6_training_feedback_staff_performance | ['training', 'feedback', 'staff', 'performance... | ["THE USE OF BEHAVIORAL SKILLS TRAINING TO TEA... | Staff Training and Performance Enhancement | Staff training |
| 7 | 8 | 211 | 7_reading_students_words_instruction | ['reading', 'students', 'words', 'instruction'... | ['TECHNOLOGY-ASSISTED READING FLUENCY INTERVEN... | Reading Instruction and Vocabulary Development | Reading and writing instruction |
| 8 | 9 | 151 | 8_behavior_classroom_game_disruptive | ['behavior', 'classroom', 'game', 'disruptive'... | ["EVALUATION OF THE GOOD BEHAVIOR GAME USING C... | Classroom Behavior and Management Strategies | Classroom management |
| 9 | 10 | 133 | 9_treatment_hearing_speech_study | ['treatment', 'hearing', 'speech', 'study', 's... | ['USING TREATMENT TO IMPROVE THE PRODUCTION OF... | Speech and Hearing Therapy Research | Speech and language therapy |
| 10 | 11 | 132 | 10_students_learning_solving_virtual | ['students', 'learning', 'solving', 'virtual',... | ['USING A VIRTUAL NUMBER LINE AND CORRECTIVE F... | Instructional Strategies for Students with Dis... | Mathematics instruction |
| 11 | 12 | 124 | 11_social_peer_asd_peers | ['social', 'peer', 'asd', 'peers', 'children',... | ["INTERACTION AMONG PRESCHOOLERS WITH AND WITH... | Social Interaction Skills for Children with ASD | Social skills interventions |
| 12 | 13 | 80 | 12_technology_participants_disabilities_multiple | ['technology', 'participants', 'disabilities',... | ["A SMARTPHONE-BASED PROGRAM ENABLING PEOPLE W... | Smartphone Use by People with Intellectual Dis... | Assistive technology |
| 13 | 14 | 77 | 13_instruction_relations_stimulus_students | ['instruction', 'relations', 'stimulus', 'stud... | ['USING STIMULUS EQUIVALENCE-BASED INSTRUCTION... | Programmed Instruction for College Students | Computer-based instruction |
| 14 | 15 | 67 | 14_brain_injury_cognitive_rehabilitation | ['brain', 'injury', 'cognitive', 'rehabilitati... | ['EFFICACY OF ELECTRONIC PORTABLE ASSISTIVE DE... | Cognitive Rehabilitation for Brain Injury | Cognitive rehabilitation |
In [29]:
#topic_info = topic_info.drop(columns=['Topic'])
#topic_info_concat = pd.merge(topic_info, topic_info_human_loop[['Name', 'Topic', 'CustomLabel']], how="left", on="Name")
topic_info_concat = pd.merge(topic_info, topic_info_human_loop[['Name', 'CustomLabel']], how="left", on="Name")
order = ['Topic', 'Count', 'Name', 'Representation', 'GenName', 'CustomLabel']
topic_info_concat = topic_info_concat.reindex(columns=order)
#topic_info_words = topic_info_words.drop(columns=['Topic'])
#topic_info_words_concat = pd.merge(topic_info_words, topic_info_human_loop[['Name', 'Topic', 'CustomLabel']], how="left", on="Name")
topic_info_words_concat = pd.merge(topic_info_words, topic_info_human_loop[['Name', 'CustomLabel']], how="left", on="Name")
order = ['Topic', 'Count', 'Name', 'Representation', 'Word', 'Score', 'Representative_Docs', 'GenName', 'CustomLabel']
topic_info_words_concat = topic_info_words_concat.reindex(columns=order)
In [30]:
files = {
"files/topic_model.pkl": topic_model,
"files/publication_embeddings.pkl": publication_embeddings,
"files/publication_embeddings_df.pkl": publication_embeddings_df,
"files/reduced_embeddings.pkl": reduced_embeddings,
"files/reduced_embeddings_df.pkl": reduced_embeddings_df,
"files/topic_info.pkl": topic_info,
"files/topic_info_concat.pkl": topic_info_concat,
"files/topic_info_words.pkl": topic_info_words,
"files/topic_info_words_concat.pkl": topic_info_words_concat,
"files/df.pkl": df
}
for filename, data in files.items():
if data is None:
continue
try:
with open(filename, "wb") as f:
pickle.dump(data, f)
except Exception as e:
print(f"Error saving pickle for {filename}: {e}")
try:
excel_filename = filename.rsplit('.', 1)[0] + ".xlsx"
if isinstance(data, pd.DataFrame):
data.to_excel(excel_filename, index=False)
elif isinstance(data, np.ndarray) and data.ndim in [1, 2]:
pd.DataFrame(data).to_excel(excel_filename, index=False)
except Exception as e:
print(f"Error saving Excel for {filename}: {e}")
In [ ]:
#def load_data_from_files(files_dict):
# loaded_data = {}
# for filename, var_name in files_dict.items():
# filepath = os.path.join('files', filename)
# if os.path.exists(filepath) and os.path.getsize(filepath) > 0:
# try:
# with open(filepath, "rb") as f:
# loaded_data[var_name] = pickle.load(f)
# except Exception as e:
# loaded_data[var_name] = None
# print(f"Error loading {filepath}: {e}")
# else:
# loaded_data[var_name] = None
# return loaded_data
#files_to_load = {
# "topic_model.pkl": "topic_model",
# "publication_embeddings.pkl": "publication_embeddings",
# "publication_embeddings_df.pkl": "publication_embeddings_df",
# "reduced_embeddings.pkl": "reduced_embeddings",
# "reduced_embeddings_df.pkl": "reduced_embeddings_df",
# "topic_info.pkl": "topic_info",
# "topic_info_concat.pkl": "topic_info_concat",
# "topic_info_words.pkl": "topic_info_words",
# "topic_info_words_concat.pkl": "topic_info_words_concat",
# "df.pkl": "df"
#}
#loaded_data = load_data_from_files(files_to_load)
#topic_model = loaded_data.get("topic_model")
#publication_embeddings = loaded_data.get("publication_embeddings")
#publication_embeddings_df = loaded_data.get("publication_embeddings_df")
#reduced_embeddings = loaded_data.get("reduced_embeddings")
#reduced_embeddings_df = loaded_data.get("reduced_embeddings_df")
#topic_info = loaded_data.get("topic_info")
#topic_info_concat = loaded_data.get("topic_info_concat")
#topic_info_words = loaded_data.get("topic_info_words")
#topic_info_words_concat = loaded_data.get("topic_info_words_concat")
#df = loaded_data.get("df")
Identify representative words and scores for each topic¶
In [31]:
topic_word_data = topic_info_words_concat.copy()
topic_word_data['Word (c-TF-IDF)'] = topic_word_data.apply(
lambda row: f"{row['Word']} ({row['Score']:.3f})", axis=1
)
topic_word_data = topic_word_data.sort_values(by=['CustomLabel', 'Score'], ascending=[True, False])
topic_word_table = topic_word_data.groupby('CustomLabel', sort=False).agg(
Topic=('Topic', 'first'),
Count=('Count', 'first'),
**{'Word (c-TF-IDF)': ('Word (c-TF-IDF)', lambda x: ', '.join(x))}
).reset_index()[['Topic', 'CustomLabel', 'Count', 'Word (c-TF-IDF)']]
topic_word_table = topic_word_table.sort_values(by='Topic')
styles = [
dict(selector="caption", props=[("caption-side", "top")]),
{'selector': 'th', 'props': [('text-align', 'center')]},
{'selector': 'td', 'props': [('text-align', 'left')]}
]
topic_word_table = (
topic_word_table.style
.set_caption("<b>Topic Word Score</b>")
.set_table_styles(styles)
.set_properties(**{
'font-size': '24px',
'font-family': 'Helvetica Neue',
'color': 'black',
'text-align': 'center'
})
.set_properties(**{
'text-align': 'center',
'font-size': '14px',
'font-family': 'Helvetica Neue',
'color': 'black',
}, subset=pd.IndexSlice[:, topic_word_table.columns])
.hide(axis='index')
)
In [32]:
topic_word_table
Out[32]:
| Topic | CustomLabel | Count | Word (c-TF-IDF) |
|---|---|---|---|
| 1 | Health Interventions | 500 | study (0.028), intervention (0.027), treatment (0.026), design (0.026), single (0.025), therapy (0.025), participants (0.023), baseline (0.022), case (0.021), health (0.019) |
| 2 | Video modeling | 490 | video (0.063), skills (0.045), modeling (0.041), video modeling (0.035), intellectual (0.030), asd (0.029), children (0.028), autism (0.028), students (0.027), social (0.026) |
| 3 | Telehealth services | 476 | aba (0.053), children (0.034), parent (0.033), training (0.032), parents (0.032), telehealth (0.031), asd (0.028), intervention (0.027), autism (0.027), behavior (0.023) |
| 4 | Function-based interventions | 394 | behavior (0.077), reinforcement (0.077), response (0.038), analysis (0.038), functional (0.033), functional analysis (0.031), automatic (0.029), treatment (0.029), problem (0.027), problem behavior (0.027) |
| 5 | Single-case design analysis | 339 | data (0.049), design (0.048), case (0.045), single (0.040), analysis (0.038), single case (0.037), research (0.034), learning (0.026), designs (0.025), visual (0.024) |
| 6 | AAC interventions | 245 | aac (0.073), communication (0.070), speech (0.045), children (0.037), asd (0.033), autism (0.027), intervention (0.027), picture (0.024), participants (0.022), studies (0.022) |
| 7 | Staff training | 218 | training (0.080), feedback (0.053), staff (0.046), performance (0.042), skills (0.035), behavioral (0.032), video (0.032), skills training (0.028), participants (0.025), teaching (0.021) |
| 8 | Reading and writing instruction | 211 | reading (0.100), students (0.053), words (0.037), instruction (0.032), vocabulary (0.032), word (0.032), intervention (0.028), learning (0.027), fluency (0.025), study (0.024) |
| 9 | Classroom management | 151 | behavior (0.085), classroom (0.063), game (0.054), disruptive (0.052), students (0.048), good (0.041), student (0.036), group (0.036), management (0.035), teachers (0.035) |
| 10 | Speech and language therapy | 133 | treatment (0.062), hearing (0.051), speech (0.049), study (0.028), single (0.022), results (0.022), language (0.022), therapy (0.022), design (0.022), subject (0.022) |
| 11 | Mathematics instruction | 132 | students (0.092), learning (0.039), solving (0.039), virtual (0.038), instruction (0.038), disabilities (0.032), problems (0.032), intervention (0.028), problem solving (0.026), study (0.026) |
| 12 | Social skills interventions | 124 | social (0.091), peer (0.061), asd (0.049), peers (0.047), children (0.045), skills (0.035), interactions (0.033), intervention (0.031), interaction (0.029), communication (0.028) |
| 13 | Assistive technology | 80 | technology (0.058), participants (0.055), disabilities (0.048), multiple (0.040), intellectual (0.039), people (0.038), study (0.033), activity (0.031), smartphone (0.029), persons (0.028) |
| 14 | Computer-based instruction | 77 | instruction (0.077), relations (0.077), stimulus (0.053), students (0.040), college (0.037), programmed (0.035), participants (0.034), based (0.033), classes (0.033), based instruction (0.033) |
| 15 | Cognitive rehabilitation | 67 | brain (0.068), injury (0.063), cognitive (0.039), rehabilitation (0.036), intervention (0.032), participants (0.030), single (0.027), case (0.026), study (0.025), single case (0.025) |
Visualize the semantic embedding space of documents by topic¶
In [33]:
color_map = [
'#EF5350',
'#42A5F5',
'#66BB6A',
'#FFA726',
'#AB47BC',
'#26A69A',
'#FFEE58',
'#FF8A65',
'#EC407A',
'#90A4AE',
'#C0CA33',
'#4DD0E1',
'#A1887F',
'#CE93D8',
'#7986CB'
]
docs_topics_data = pd.merge(df, topic_info_concat[['Topic', 'CustomLabel']], on='Topic', how='left')
docs_topics_data['Topic'] = pd.to_numeric(docs_topics_data['Topic'], errors='coerce')
docs_topics_data = pd.merge(docs_topics_data, reduced_embeddings_df[['UT', 'x', 'y']], on='UT', how='left')
docs_topics_data['x'] = pd.to_numeric(docs_topics_data['x'], errors='coerce')
docs_topics_data['y'] = pd.to_numeric(docs_topics_data['y'], errors='coerce')
docs_topics_data = docs_topics_data.dropna(subset=['Topic', 'x', 'y'])
#docs_topics_data = df.copy()
docs_topics_data['Topic'] = docs_topics_data['Topic'].astype(int)
ordered_legend = docs_topics_data.sort_values('Topic')['CustomLabel'].unique().tolist()
ordered_legend = [x for x in ordered_legend if pd.notnull(x)]
docs_topics_data['legend_topic'] = pd.Categorical(docs_topics_data['CustomLabel'],
categories=ordered_legend,
ordered=True)
In [34]:
docs_topics_data['hover_text'] = (
"Title: " +
docs_topics_data['TI'].apply(lambda text: "<br>".join(textwrap.wrap(text, width=50))) +
"<br><br>Topic " + docs_topics_data['Topic'].astype(str) + ": " + docs_topics_data['CustomLabel']
)
color_mapping = {cat: color_map[i % len(color_map)] for i, cat in enumerate(ordered_legend)}
fig_cluster = px.scatter(
docs_topics_data,
x='x',
y='y',
color='legend_topic',
color_discrete_map=color_mapping,
labels={'x': 'X', 'y': 'Y'},
custom_data=['hover_text'],
category_orders={'legend_topic': ordered_legend}
)
fig_cluster.update_traces(
marker=dict(size=10, opacity=0.9, line=dict(width=1, color="white")),
selector=dict(mode='markers'),
hovertemplate='<b>%{customdata[0]}</b><extra></extra>'
)
fig_cluster.update_traces(
hoverlabel=dict(font_size=16)
)
fig_cluster.update_layout(
title="<b>Semantic Space of Documents by Topic</b>",
title_x=0.5,
title_font=dict(size=24, family="Helvetica Neue", color="black"),
margin=dict(t=80, b=80, l=80, r=80),
plot_bgcolor='white',
paper_bgcolor='white',
width=900,
height=800,
showlegend=True,
legend=dict(
orientation="h",
yanchor="top",
y=-0.15,
xanchor="center",
x=0.5,
title_text="",
font=dict(size=14),
title_font=dict(size=16)
)
)
fig_cluster.update_layout(width=None, height=None, autosize=True)
fig_cluster.write_html(
"results/fig_cluster.html",
config={"responsive": True}
)
# fig_cluster.show()
In [35]:
IFrame(src='results/fig_cluster.html', width=900, height=1000)
Out[35]:
Calculate cosine similarity among topics¶
In [36]:
merged_df = pd.merge(df, publication_embeddings_df, on='UT')
embeddings = [col for col in publication_embeddings_df.columns if col != 'UT']
topic_embeddings = merged_df.groupby("Topic")[embeddings].mean()
cosine_sim_matrix = cosine_similarity(topic_embeddings)
cosine_sim_df = pd.DataFrame(
cosine_sim_matrix,
index=topic_embeddings.index,
columns=topic_embeddings.index
)
topic_to_label = topic_info_concat.set_index('Topic')['CustomLabel'].to_dict()
hover_text = []
for i, row_topic in enumerate(cosine_sim_df.index):
row_hover = []
for j, col_topic in enumerate(cosine_sim_df.columns):
row_label = topic_to_label.get(row_topic, f"Topic {row_topic}")
col_label = topic_to_label.get(col_topic, f"Topic {col_topic}")
similarity = cosine_sim_df.iloc[i, j]
hover = f'Topic {row_topic}: {row_label}<br>Topic {col_topic}: {col_label}<br>Cosine Similarity: {similarity:.3f}'
row_hover.append(hover)
hover_text.append(row_hover)
fig_cosine_sim = px.imshow(
cosine_sim_df,
color_continuous_scale="YlGnBu",
origin="lower",
labels=dict(color="Similarity"),
x=cosine_sim_df.columns,
y=cosine_sim_df.index,
text_auto=True
)
fig_cosine_sim.update_xaxes(
tickmode='array',
tickvals=cosine_sim_df.columns,
ticktext=[str(topic) for topic in cosine_sim_df.columns],
tickfont=dict(size=13)
)
fig_cosine_sim.update_yaxes(
tickmode='array',
tickvals=cosine_sim_df.index,
ticktext=[str(topic) for topic in cosine_sim_df.index],
tickfont=dict(size=13)
)
fig_cosine_sim.update_traces(
texttemplate='%{z:.2f}',
textfont=dict(size=13),
customdata=hover_text,
hovertemplate='%{customdata}<extra></extra>',
hoverlabel=dict(font_size=13)
)
fig_cosine_sim.update_layout(
title="<b>Cosine Similarity Among Topics</b>",
title_x=0.5,
title_font=dict(size=24, family="Helvetica Neue", color="black"),
margin=dict(t=80, b=80, l=80, r=80),
plot_bgcolor='white',
paper_bgcolor='white',
width=900,
height=900,
showlegend=True,
legend_title_text="Topic",
legend_title_font=dict(size=13),
legend_font=dict(size=13),
font=dict(size=13)
)
x_dom = fig_cosine_sim.layout.xaxis.domain
x_center = (x_dom[0] + x_dom[1]) / 2
x_len = 1 * (x_dom[1] - x_dom[0])
fig_cosine_sim.update_layout(
coloraxis_colorbar=dict(
orientation='h',
x=x_center,
xanchor='center',
y=-0.13,
yanchor='top',
len=x_len,
thickness=12
)
)
fig_cosine_sim.update_layout(width=None, height=None, autosize=True)
fig_cosine_sim.write_html(
"results/fig_cosine_sim.html",
config={"responsive": True}
)
# fig_cosine_sim.show()
In [37]:
IFrame(src='results/fig_cosine_sim.html', width=900, height=900)
Out[37]:
Ngrams¶
In [38]:
vectorizer = vectorizer_model.build_analyzer()
df = docs_topics_data.copy()
def extract_ngrams(text):
all_ngrams = vectorizer(text)
unigrams = [ng for ng in all_ngrams if len(ng.split()) == 1]
bigrams = [ng for ng in all_ngrams if len(ng.split()) == 2]
trigrams = [ng for ng in all_ngrams if len(ng.split()) == 3]
return unigrams, bigrams, trigrams
df['unigrams'], df['bigrams'], df['trigrams'] = zip(*df['combined_text'].apply(extract_ngrams))
In [39]:
with open("files/df.pkl", "wb") as f_df:
pickle.dump(df, f_df)
df.to_excel("files/df.xlsx", engine='openpyxl', index=False)
topic_word_table.to_excel("files/topic_word_table.xlsx", engine='openpyxl', index=False)
cosine_sim_df.to_excel("files/cosine_sim_df.xlsx", engine='openpyxl', index=False)
In [ ]:
# with open("files/df.pkl", "rb") as f_df:
# df = pickle.load(f_df)
# df = pd.read_excel("files/df.xlsx")
# topic_word_table = pd.read_excel("files/topic_word_table.xlsx")
# cosine_sim_df = pd.read_excel("files/cosine_sim_df.xlsx")
In [41]:
from nbconvert import HTMLExporter
import nbformat
notebook_path = 'index.ipynb'
html_exporter = HTMLExporter()
with open(notebook_path, 'r', encoding='utf-8') as nb_file:
notebook_content = nb_file.read()
notebook = nbformat.reads(notebook_content, as_version=4)
if 'widgets' in notebook.metadata and 'application/vnd.jupyter.widget-state+json' in notebook.metadata['widgets']:
if 'state' not in notebook.metadata['widgets']['application/vnd.jupyter.widget-state+json']:
notebook.metadata['widgets']['application/vnd.jupyter.widget-state+json']['state'] = {}
html_output, _ = html_exporter.from_notebook_node(notebook)
with open('index.html', 'w', encoding='utf-8') as html_file:
html_file.write(html_output)