Publication Screening on Methodology¶
In [1]:
# Stamp the notebook with its execution date, for provenance.
from datetime import datetime

run_timestamp = datetime.now()
formatted_date = run_timestamp.strftime("%B %d, %Y")  # e.g. "February 28, 2026"
print(formatted_date)
February 28, 2026
0. Configuring the System Environment¶
In [2]:
# Check GPU model, driver, and free memory (IPython shell escape;
# an A100-40GB Colab runtime is expected for the runs recorded here).
!nvidia-smi
Sat Feb 28 17:49:54 2026 +-----------------------------------------------------------------------------------------+ | NVIDIA-SMI 580.82.07 Driver Version: 580.82.07 CUDA Version: 13.0 | +-----------------------------------------+------------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA A100-SXM4-40GB Off | 00000000:00:04.0 Off | 0 | | N/A 32C P0 46W / 400W | 0MiB / 40960MiB | 0% Default | | | | Disabled | +-----------------------------------------+------------------------+----------------------+ +-----------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=========================================================================================| | No running processes found | +-----------------------------------------------------------------------------------------+
In [3]:
# Check total/available system RAM (human-readable units).
!free -h
total used free shared buff/cache available Mem: 83Gi 855Mi 78Gi 2.0Mi 3.7Gi 81Gi Swap: 0B 0B 0B
In [4]:
# Mount Google Drive so project files persist across Colab sessions.
from google.colab import drive
drive.mount('/content/drive')

# Read the Hugging Face token from Colab's secrets manager.
# NOTE(review): the return value is discarded — presumably this call only
# triggers the secret-access grant for this notebook; confirm, or assign
# the result if the token is needed downstream.
from google.colab import userdata
userdata.get('HF_TOKEN')

# Update this path to your project directory in Google Drive
%cd /content/drive/My\ Drive/Colab\ Notebooks/LLM/sped_biblio/methodology
Mounted at /content/drive /content/drive/My Drive/Colab Notebooks/LLM/sped_biblio/methodology
2. Screening Studies on Methodology¶
In [5]:
import pandas as pd
import re
In [6]:
# Load the corpus produced by the previous screening stage (technology use).
# Plain string literal: the original used an f-string with no placeholders.
all_data_file = "files/all_data_technology_use.xlsx"
# na_filter=False keeps empty cells as "" rather than NaN, so the string
# concatenation below never injects the literal text "nan".
all_data = pd.read_excel(all_data_file, na_filter=False)

# Combine title (TI), abstract (AB), and author keywords (DE) into one
# searchable text field per record.
text_cols = ['TI', 'AB', 'DE']
all_data['combined_text'] = all_data[text_cols].astype(str).agg(' '.join, axis=1)
In [7]:
# Methodology-related regex vocabulary. All patterns are compiled with
# re.IGNORECASE below. sorted(set(...)) dedupes and gives a deterministic
# order; note that [-\s] classes match a hyphen or whitespace separator,
# while \w* does NOT cross a hyphen (it matches word characters only).
patterns = sorted(set([
    r'\banaly\w*\b',  # "analyze", "analysis"
    r'\bartificial\w*[-\s]+intelligen\w*\b',  # "artificial intelligence", "artificial-intelligence"
    r'\bbehavior\w*[-\s]+chang\w*\b|\bchang\w*[-\s]*behavior\w*\b',  # "behavior change", "change behavior"
    r'\bbehavior\w*\b',  # "behavior", "behaviors"
    r'\bcollect\w*[-\s]+data\b|\bdata[-\s]*collect\w*\b',  # "collect data", "data collection"
    r'\bconstruct\w*[-\s]+graph\w*\b|\bgraph[-\s]*construct\w*\b',  # "construct graph", "graph construct"
    r'\bcreate\w*[-\s]+graph\w*\b|\bgraph[-\s]*create\w*\b',  # "create graph", "graph create"
    r'\bdata[-\s]+analy\w*\b|\banaly\w*[-\s]*data\b',  # "data analysis", "analyze data"
    r'\bdata[-\s]*based\b',  # "data-based", "databased"
    r'\bdata[-\s]*driven\b',  # "data-driven", "data driven"
    r'\bdata[-\s]*extract\w*\b|\bextract\w*[-\s]+data\b',  # "data extraction", "extract data"
    r'\bdata[-\s]*gather\w*\b|\bgather\w*[-\s]*data\b',  # "data gathering", "gather data"
    r'\bdata[-\s]*informed\b',  # "data-informed", "data informed"
    r'\bdata[-\s]*record\w*\b|\brecord\w*[-\s]+data\b',  # "data recording", "record data"
    r'\bdata[-\s]*retriev\w*\b|\bretriev\w*[-\s]*data\b',  # "data retrieving", "retrieve data"
    r'\bdata[-\s]*shar\w*\b|\bshar\w*[-\s]+data\b',  # "data sharing", "share data"
    r'\bdecision\w*[-\s]*mak\w*\b|\bmak\w*[-\s]*decision\w*\b',  # "decision-making", "make decisions"
    r'\bdependent[-\s]+variable\w*\b',  # "dependent variable", "dependent variables"
    r'\bdesign\w*\b',  # "design", "designing"
    r'\bdual\w*[-\s]+criteri\w*\b',  # "dual criteria", "dual-criterion"
    r'\beffect\w*[-\s]+size\w*\b',  # "effect size", "effect sizes"
    r'\bethic\w*\b',  # "ethic", "ethics"
    r'\bevidence[-\s]*base\w*\b',  # "evidence-based", "evidencebased"
    r'\bevidence\w*\b',  # "evidence", "evidences"
    r'\bexamin\w*\b',  # "examine", "examination"
    r'\bexperiment\w*\b',  # "experiment", "experimental"
    r'\bfunction\w*[-\s]+relation\w*\b',  # "functional relation", "functional relationship" (separator required)
    r'\bgeneraliz\w*\b',  # "generalize", "generalization"
    r'\bindependent[-\s]+variable\w*\b',  # "independent variable", "independent variables"
    r'\binspect\w*[-\s]+visual\w*\b|\bvisual\w*[-\s]+inspect\w*\b',  # "inspect visually", "visual inspection"
    r'\binterrupted[-\s]+time[-\s]+seri\w*\b',  # "interrupted time series", "interrupted-time-series" (separators required; does NOT match "timeseries")
    r'\blevel[-\s]*chang\w*\b|\bchang\w*[-\s]*level\b',  # "level change", "change level"
    r'\bmachine\w*[-\s]+learn\w*\b',  # "machine learning", "machine-learning"
    r'\bmaintain\w*\b',  # "maintain", "maintenance"
    r'\bmeasure\w*\b',  # "measure", "measurement"
    r'\bmeta\w*[-\s]*analys\w*\b',  # "meta-analysis", "meta analysis"
    r'\bmethod\w*\b',  # "method", "methodology"
    r'\bmodel\w*\b',  # "model", "modeling"
    r'\bmoment[-\s]+to[-\s]+moment\b',  # "moment-to-moment", "moment to moment"
    r'\bmulti\w*level\w*\b',  # "multilevel" (hyphenated "multi-level" is NOT matched: \w* does not cross '-')
    r'\bN[-\s]*of[-\s]*1\w*\b',  # "N-of-1", "N of 1"
    r'\bnetwork\w*\b',  # "network", "networking"
    r'\bnon\w*parametric\w*\b',  # "nonparametric" (hyphenated "non-parametric" is caught via the separate "parametric" pattern)
    r'\bopen[-\s]+science\b',  # "open science", "open-science"
    r'\boverlap\w*\b',  # "overlap", "overlapping"
    r'\bparametric\w*\b',  # "parametric", "parametrics"
    r'\bphase\w*\b',  # "phase", "phases"
    r'\bpre\w*[-\s]*register\w*\b',  # "pre-register", "preregister"
    r'\bPRISMA\b',  # "PRISMA", "prisma" (case-insensitive at compile time)
    r'\bprocedure\w*\b',  # "procedure", "procedures"
    r'\bprocess\w*\b',  # "process", "processes"
    r'\bprompt\w*\b',  # "prompt", "prompting"
    r'\bprotocol\b',  # "protocol" (singular only: no \w* suffix, so "protocols" is NOT matched)
    r'\bqualit\w*[-\s]+indicator\w*\b',  # "quality indicator", "quality-indicators"
    r'\bquantif\w*\b',  # "quantify", "quantification"
    r'\brandomiz\w*\b',  # "randomize", "randomization"
    r'\breinforc\w*\b',  # "reinforce", "reinforcer"
    r'\breliabilit\w*\b',  # "reliability", "reliabilities"
    r'\breplicat\w*\b',  # "replication", "replicate"
    r'\brepresent\w*[-\s]+visual\w*\b|\bvisual\w*[-\s]+represent\w*\b',  # "represent visually", "visual representation"
    r'\bresponse\w*\b',  # "response", "responses"
    r'\bguided[-\s]*response\w*\b|\bresponse[-\s]*guided\b',  # "guided response", "response guided"
    r'\brigor\w*\b',  # "rigor", "rigorous"
    r'\brisk\w*[-\s]+assess\w*\b',  # "risk assessment", "risk-assessments"
    r'\brubric\w*\b',  # "rubric", "rubrics"
    r'\bscreen\w*\b',  # "screen", "screening"
    r'\bsetting\w*\b',  # "setting", "settings"
    r'\bstandard\w*\b',  # "standard", "standards"
    r'\bstimul\w*\b',  # "stimulus", "stimuli"
    r'\bsynthes\w*\b',  # "synthesize", "synthesis"
    r'\btechnique\w*\b',  # "technique", "techniques"
    r'\btest\w*\b',  # "test", "testing"
    r'\btrend\w*\b',  # "trend", "trending"
    r'\btreatment\w*\b',  # "treatment", "treatments"
    r'\bvalidit\w*\b',  # "validity", "validities"
    r'\bvisual\w*[-\s]*analy\w*\b|\banaly\w*[-\s]*visual\w*\b',  # "visual analysis", "analyze visually"
    r'\bwhat[-\s]+works[-\s]+clearinghouse\b'  # "what works clearinghouse", "what-works-clearinghouse"
]))
# Compile once up front; searches below reuse the compiled objects.
pattern_list = [re.compile(pattern, re.IGNORECASE) for pattern in patterns]
# Flag each record whose combined text matches at least one methodology term.
def _mentions_methodology(text):
    """Return 'Yes' if any compiled methodology pattern occurs in text, else 'No'."""
    for compiled in pattern_list:
        if compiled.search(text):
            return 'Yes'
    return 'No'

all_data['methodology'] = all_data['combined_text'].apply(_mentions_methodology)

# Report how many records passed the methodology screen.
methodology_count = all_data['methodology'].value_counts().get("Yes", 0)
print(f"Number of 'Yes' predictions for methodology: {methodology_count}")
Number of 'Yes' predictions for methodology: 5879
In [8]:
# A study survives the full screen only if it passed all three stages:
# single-case design, technology use, and methodology.
# Vectorized .eq/.all replaces the original row-wise apply(..., axis=1)
# with chained `and`s — identical result, no per-row Python lambda.
screen_cols = ['single_case', 'technology_use', 'methodology']
passed_all = all_data[screen_cols].eq('Yes').all(axis=1)
all_data['filtered'] = passed_all.map({True: 'Yes', False: 'No'})

# Report how many studies passed every filter.
filtered_pubs = all_data['filtered'].value_counts().get("Yes", 0)
print(f"Number of 'Yes' predictions for filtered studies: {filtered_pubs}")

# Persist the fully annotated dataset for the next stage of the pipeline.
# Plain string literal: the original used an f-string with no placeholders.
all_data_name = "files/all_data.xlsx"
all_data.to_excel(all_data_name, index=False)
print(f"Data saved to {all_data_name}")
Number of 'Yes' predictions for filtered studies: 3637 Data saved to files/all_data.xlsx
In [9]:
# Export this notebook to a standalone HTML page (e.g., for web hosting).
from nbconvert import HTMLExporter
import nbformat

notebook_path = 'index.ipynb'
html_exporter = HTMLExporter()

# nbformat.read parses directly from the open file handle — no need to
# slurp the raw text and call nbformat.reads separately as before.
with open(notebook_path, 'r', encoding='utf-8') as nb_file:
    notebook = nbformat.read(nb_file, as_version=4)

# Workaround: Colab sometimes saves widget metadata without the 'state'
# key, which makes nbconvert's widget template fail; inject an empty one.
widget_meta = notebook.metadata.get('widgets', {}).get(
    'application/vnd.jupyter.widget-state+json'
)
if widget_meta is not None and 'state' not in widget_meta:
    widget_meta['state'] = {}

html_output, _ = html_exporter.from_notebook_node(notebook)
with open('index.html', 'w', encoding='utf-8') as html_file:
    html_file.write(html_output)