Publication Screening on Methodology¶


In [1]:
# Record the date this notebook was last run (provenance for the report).
from datetime import datetime

run_timestamp = datetime.now()
formatted_date = run_timestamp.strftime("%B %d, %Y")
print(formatted_date)
February 28, 2026

0. Configuring the System Environment¶

In [2]:
# Check GPU memory
# (IPython shell magic) prints the attached GPU model, driver/CUDA versions,
# and current VRAM usage — run before loading any models to confirm resources.
!nvidia-smi
Sat Feb 28 17:49:54 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.82.07              Driver Version: 580.82.07      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   32C    P0             46W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+
In [3]:
# Check system RAM
# (IPython shell magic) prints total/used/free memory in human-readable units.
!free -h
               total        used        free      shared  buff/cache   available
Mem:            83Gi       855Mi        78Gi       2.0Mi       3.7Gi        81Gi
Swap:             0B          0B          0B

1. Setting Up the Computing Environment¶

Mount Google Drive, load the Hugging Face token, and change into the project directory.¶
In [4]:
# Mount Google Drive so project files persist across Colab sessions.
from google.colab import drive
drive.mount('/content/drive')

# Read the Hugging Face token from Colab's secret store.
# NOTE(review): the return value is discarded, so this only verifies the
# secret is accessible; if a later step needs the token, assign it
# (e.g. to an environment variable) — TODO confirm intended use.
from google.colab import userdata
userdata.get('HF_TOKEN')

# Update this path to your project directory in Google Drive
%cd /content/drive/My\ Drive/Colab\ Notebooks/LLM/sped_biblio/methodology
Mounted at /content/drive
/content/drive/My Drive/Colab Notebooks/LLM/sped_biblio/methodology

2. Screening Studies on Methodology¶

In [5]:
import pandas as pd
import re
In [6]:
# Load the dataset from the previous (technology-use) screening stage.
# na_filter=False keeps empty cells as "" so the string concatenation below
# never injects the literal text "nan" into combined_text.
all_data_file = "files/all_data_technology_use.xlsx"  # was a pointless f-string
all_data = pd.read_excel(all_data_file, na_filter=False)

# Combine the text columns into one searchable field per record.
# Presumably Web of Science field tags: TI = title, AB = abstract,
# DE = author keywords — TODO confirm against the export settings.
text_cols = ['TI', 'AB', 'DE']
all_data['combined_text'] = all_data[text_cols].astype(str).agg(' '.join, axis=1)
In [7]:
# Regex screen for methodology-related terminology.
# Conventions: \b anchors word boundaries; a trailing \w* accepts
# morphological variants ("analy" -> "analyze"/"analysis"); [-\s]+ / [-\s]*
# matches hyphenated, spaced, or fused compounds.
# (The original sorted(set([...])) wrapper was dropped: the literals are
# already unique and matching order does not affect any().)
patterns = [
    r'\banaly\w*\b',  # "analyze", "analysis"
    r'\bartificial\w*[-\s]+intelligen\w*\b',  # "artificial intelligence", "artificial-intelligence"
    r'\bbehavior\w*[-\s]+chang\w*\b|\bchang\w*[-\s]*behavior\w*\b',  # "behavior change", "change behavior"
    r'\bbehavior\w*\b',  # "behavior", "behaviors"
    r'\bcollect\w*[-\s]+data\b|\bdata[-\s]*collect\w*\b',  # "collect data", "data collection"
    r'\bconstruct\w*[-\s]+graph\w*\b|\bgraph[-\s]*construct\w*\b',  # "construct graph", "graph construct"
    r'\bcreate\w*[-\s]+graph\w*\b|\bgraph[-\s]*create\w*\b',  # "create graph", "graph create"
    r'\bdata[-\s]+analy\w*\b|\banaly\w*[-\s]*data\b',  # "data analysis", "analyze data"
    r'\bdata[-\s]*based\b',  # "data-based", "databased"
    r'\bdata[-\s]*driven\b',  # "data-driven", "data driven"
    r'\bdata[-\s]*extract\w*\b|\bextract\w*[-\s]+data\b',  # "data extraction", "extract data"
    r'\bdata[-\s]*gather\w*\b|\bgather\w*[-\s]*data\b',  # "data gathering", "gather data"
    r'\bdata[-\s]*informed\b',  # "data-informed", "data informed"
    r'\bdata[-\s]*record\w*\b|\brecord\w*[-\s]+data\b',  # "data recording", "record data"
    r'\bdata[-\s]*retriev\w*\b|\bretriev\w*[-\s]*data\b',  # "data retrieving", "retrieve data"
    r'\bdata[-\s]*shar\w*\b|\bshar\w*[-\s]+data\b',  # "data sharing", "share data"
    r'\bdecision\w*[-\s]*mak\w*\b|\bmak\w*[-\s]*decision\w*\b',  # "decision-making", "make decisions"
    r'\bdependent[-\s]+variable\w*\b',  # "dependent variable", "dependent variables"
    r'\bdesign\w*\b',  # "design", "designing"
    r'\bdual\w*[-\s]+criteri\w*\b',  # "dual criteria", "dual-criterion"
    r'\beffect\w*[-\s]+size\w*\b',  # "effect size", "effect sizes"
    r'\bethic\w*\b',  # "ethic", "ethics"
    r'\bevidence[-\s]*base\w*\b',  # "evidence-based", "evidencebased"
    r'\bevidence\w*\b',  # "evidence", "evidences"
    r'\bexamin\w*\b',  # "examine", "examination"
    r'\bexperiment\w*\b',  # "experiment", "experimental"
    r'\bfunction\w*[-\s]+relation\w*\b',  # "functional relationship", "function-related"
    r'\bgeneraliz\w*\b',  # "generalize", "generalization"
    r'\bindependent[-\s]+variable\w*\b',  # "independent variable", "independent variables"
    r'\binspect\w*[-\s]+visual\w*\b|\bvisual\w*[-\s]+inspect\w*\b',  # "inspect visually", "visual inspection"
    r'\binterrupted[-\s]+time[-\s]+seri\w*\b',  # "interrupted time series", "interrupted-timeseries"
    r'\blevel[-\s]*chang\w*\b|\bchang\w*[-\s]*level\b',  # "level change", "change level"
    r'\bmachine\w*[-\s]+learn\w*\b',  # "machine learning", "machine-learning"
    r'\bmaintain\w*\b',  # "maintain", "maintenance"
    r'\bmeasure\w*\b',  # "measure", "measurement"
    r'\bmeta\w*[-\s]*analys\w*\b',  # "meta-analysis", "meta analysis"
    r'\bmethod\w*\b',  # "method", "methodology"
    r'\bmodel\w*\b',  # "model", "modeling"
    r'\bmoment[-\s]+to[-\s]+moment\b',  # "moment-to-moment", "moment to moment"
    r'\bmulti\w*level\w*\b',  # "multilevel", "multi-level"
    r'\bN[-\s]*of[-\s]*1\w*\b',  # "N-of-1", "N of 1"
    r'\bnetwork\w*\b',  # "network", "networking"
    r'\bnon\w*parametric\w*\b',  # "non-parametric", "nonparametric"
    r'\bopen[-\s]+science\b',  # "open science", "open-science"
    r'\boverlap\w*\b',  # "overlap", "overlapping"
    r'\bparametric\w*\b',  # "parametric", "parametrics"
    r'\bphase\w*\b',  # "phase", "phases"
    r'\bpre\w*[-\s]*register\w*\b',  # "pre-register", "preregister"
    r'\bPRISMA\b',  # "PRISMA", "prisma"
    r'\bprocedure\w*\b',  # "procedure", "procedures"
    r'\bprocess\w*\b',  # "process", "processes"
    r'\bprompt\w*\b',  # "prompt", "prompting"
    r'\bprotocol\w*\b',  # "protocol", "protocols" (added \w* — the original \bprotocol\b could not match the plural its comment claimed)
    r'\bqualit\w*[-\s]+indicator\w*\b',  # "quality indicator", "quality-indicators"
    r'\bquantif\w*\b',  # "quantify", "quantification"
    r'\brandomiz\w*\b',  # "randomize", "randomization"
    r'\breinforc\w*\b',  # "reinforce", "reinforcer"
    r'\breliabilit\w*\b',  # "reliability", "reliable"
    r'\breplicat\w*\b',  # "replication", "replicate"
    r'\brepresent\w*[-\s]+visual\w*\b|\bvisual\w*[-\s]+represent\w*\b',  # "represent visually", "visual representation"
    r'\bresponse\w*\b',  # "response", "responses"
    r'\bguided[-\s]*response\w*\b|\bresponse[-\s]*guided\b',  # "guided response", "response guided"
    r'\brigor\w*\b',  # "rigor", "rigorous"
    r'\brisk\w*[-\s]+assess\w*\b',  # "risk assessment", "risk-assessments"
    r'\brubric\w*\b',  # "rubric", "rubrics"
    r'\bscreen\w*\b',  # "screen", "screening"
    r'\bsetting\w*\b',  # "setting", "settings"
    r'\bstandard\w*\b',  # "standard", "standards"
    r'\bstimul\w*\b',  # "stimulus", "stimuli"
    r'\bsynthes\w*\b',  # "synthesize", "synthesis"
    r'\btechnique\w*\b',  # "technique", "techniques"
    r'\btest\w*\b',  # "test", "testing"
    r'\btrend\w*\b',  # "trend", "trending"
    r'\btreatment\w*\b',  # "treatment", "treatments"
    r'\bvalidit\w*\b',  # "validity", "validities"
    r'\bvisual\w*[-\s]*analy\w*\b|\banaly\w*[-\s]*visual\w*\b',  # "visual analysis", "analyze visually"
    r'\bwhat[-\s]+works[-\s]+clearinghouse\b'  # "what works clearinghouse", "what-works-clearinghouse"
]

# Compile each pattern once, case-insensitively, rather than per row.
pattern_list = [re.compile(pattern, re.IGNORECASE) for pattern in patterns]

# A record is methodology-related if ANY pattern matches its combined
# title/abstract/keyword text; any() short-circuits on the first hit.
all_data['methodology'] = all_data['combined_text'].apply(
    lambda text: 'Yes' if any(pattern.search(text) for pattern in pattern_list) else 'No'
)

# Count the number of 'Yes' predictions
methodology_count = all_data['methodology'].value_counts().get("Yes", 0)
print(f"Number of 'Yes' predictions for methodology: {methodology_count}")
Number of 'Yes' predictions for methodology: 5879
In [8]:
# Filter publications: a record passes the overall screen only if it passed
# all three individual screens (single-case design, technology use, and
# methodology). Vectorized column comparison replaces the original
# row-by-row .apply(lambda row: ...), which is O(rows) Python-level calls.
screen_cols = ['single_case', 'technology_use', 'methodology']
all_data['filtered'] = (
    (all_data[screen_cols] == 'Yes')
    .all(axis=1)
    .map({True: 'Yes', False: 'No'})
)
filtered_pubs = all_data['filtered'].value_counts().get("Yes", 0)
print(f"Number of 'Yes' predictions for filtered studies: {filtered_pubs}")

# Save the fully annotated dataset for the next stage of the pipeline.
# (The original comment claimed this "prints the number of rows" — it saves.)
all_data_name = "files/all_data.xlsx"  # was a pointless f-string
all_data.to_excel(all_data_name, index=False)
print(f"Data saved to {all_data_name}")
Number of 'Yes' predictions for filtered studies: 3637
Data saved to files/all_data.xlsx
In [9]:
# Export this notebook to a standalone HTML page alongside the .ipynb.
from nbconvert import HTMLExporter
import nbformat

notebook_path = 'index.ipynb'
html_exporter = HTMLExporter()

# Parse the notebook into an in-memory node tree (v4 schema).
with open(notebook_path, 'r', encoding='utf-8') as nb_file:
    notebook = nbformat.reads(nb_file.read(), as_version=4)

# Some Colab-produced notebooks carry widget metadata without a 'state'
# entry, which trips up nbconvert; backfill an empty state if it is missing.
widgets_meta = notebook.metadata.get('widgets')
if widgets_meta is not None and 'application/vnd.jupyter.widget-state+json' in widgets_meta:
    widgets_meta['application/vnd.jupyter.widget-state+json'].setdefault('state', {})

# Render and write the HTML document.
html_output, _ = html_exporter.from_notebook_node(notebook)
with open('index.html', 'w', encoding='utf-8') as html_file:
    html_file.write(html_output)