Publication Screening for Technology Use¶
In [1]:
# Record when this notebook was last executed (provenance for the rendered outputs).
from datetime import datetime

run_timestamp = datetime.now()
formatted_date = run_timestamp.strftime("%B %d, %Y")
print(formatted_date)
February 28, 2026
0. Configuring the System Environment¶
In [2]:
# Check GPU model, memory, and utilization (expects an NVIDIA GPU runtime, e.g. Colab A100).
!nvidia-smi
Sat Feb 28 17:22:43 2026 +-----------------------------------------------------------------------------------------+ | NVIDIA-SMI 580.82.07 Driver Version: 580.82.07 CUDA Version: 13.0 | +-----------------------------------------+------------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA A100-SXM4-40GB Off | 00000000:00:04.0 Off | 0 | | N/A 33C P0 47W / 400W | 0MiB / 40960MiB | 0% Default | | | | Disabled | +-----------------------------------------+------------------------+----------------------+ +-----------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=========================================================================================| | No running processes found | +-----------------------------------------------------------------------------------------+
In [3]:
# Check total/used/available system RAM in human-readable units.
!free -h
total used free shared buff/cache available Mem: 83Gi 944Mi 78Gi 3.0Mi 3.7Gi 81Gi Swap: 0B 0B 0B
In [4]:
from google.colab import drive
drive.mount('/content/drive')
from google.colab import userdata
userdata.get('HF_TOKEN')
# Update this path to your project directory in Google Drive
%cd /content/drive/My\ Drive/Colab\ Notebooks/LLM/sped_biblio/screening_technology_use
Mounted at /content/drive /content/drive/My Drive/Colab Notebooks/LLM/sped_biblio/screening_technology_use
In [5]:
# Optional: recreate the pinned conda environment from the repo's YAML file
# instead of the ad-hoc pip install below (more reproducible, but slower).
# !pip install -q condacolab
# !conda env create -f python-venv-environment.yml
In [6]:
!pip install -q pubmlp
In [7]:
# Route Python warnings through a filter that drops noisy jupyter_client
# warnings (spammed by the Colab kernel) while passing everything else
# through to the original handler unchanged.
import warnings
_orig_showwarning = warnings.showwarning
def _filtered_showwarning(msg, cat, fn, ln, file=None, line=None):
    # Suppress only warnings whose source filename mentions jupyter_client.
    if 'jupyter_client' not in str(fn):
        _orig_showwarning(msg, cat, fn, ln, file, line)
warnings.showwarning = _filtered_showwarning
import pandas as pd
import torch
from torch.optim import AdamW
import torch.nn as nn
from torch.utils.data.sampler import RandomSampler, SequentialSampler
from transformers import AutoTokenizer
# Project package: model (PubMLP), data pipeline, training/evaluation helpers.
from pubmlp import (
    Config, PubMLP,
    split_data, preprocess_dataset, create_dataloader,
    train_evaluate_model, predict_model,
    get_predictions_and_labels, flag_uncertain,
    calculate_evaluation_metrics, plot_results,
    cross_validate, calibrate_model,
    get_device,
)
In [8]:
# Load the hand-labeled screening data, keep only the modeling-relevant
# columns, and split into train/validation/test.
warnings.filterwarnings('ignore', category=FutureWarning)
keep_cols = ['UT', 'single_case', 'technology_use', 'SO', 'PY', 'AF', 'TI', 'AB', 'DE']
yes_no_to_int = {'Yes': 1, 'No': 0}
labeled_data = pd.read_excel("files/labeled_data.xlsx")
labeled_df = labeled_data[keep_cols].copy()
# Report per-column missingness before any cleaning.
missing_values = labeled_df.isnull().sum()
print(missing_values)
df = labeled_df.copy()
# Coerce publication year to numeric and drop rows where it is unparseable.
df["PY"] = pd.to_numeric(df["PY"], errors='coerce')
df.dropna(subset=["PY"], inplace=True)
# Encode both binary screening labels as 0/1.
for binary_col in ("single_case", "technology_use"):
    df[binary_col] = df[binary_col].map(yes_no_to_int)
train_df, validation_df, test_df = split_data(df, random_state=42)
print(f"\nTraining: {len(train_df)}, Validation: {len(validation_df)}, Test: {len(test_df)}")
UT 0 single_case 0 technology_use 0 SO 0 PY 0 AF 0 TI 0 AB 0 DE 269 dtype: int64 Training: 1264, Validation: 158, Test: 158
Preprocess data and convert to dataloaders.¶
In [9]:
warnings.filterwarnings('ignore', category=FutureWarning)
# Hyperparameters for the BERT-embedding + MLP classifier.
config = Config(
    random_seed=2025,
    embedding_model='bert',
    batch_size=8,
    eval_batch_size=8,
    epochs=5,
    learning_rate=1e-5,
    mlp_hidden_size=16,
    dropout_rate=0.2,
    early_stopping_patience=3,
    n_hidden_layers=2,
)
# Seed the RNGs (see Config.set_random_seeds) so the run is reproducible.
config.set_random_seeds()
device = get_device()
print(f"Device: {device}, Batch size: {config.batch_size}")
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
# Which dataframe columns feed each model input: free text, categorical,
# numeric, and the binary target.
column_specifications = {
    "text_cols": ['AF', 'TI', 'AB', 'DE'],
    "categorical_cols": ['SO'],
    "numeric_cols": ['PY'],
    "label_col": "technology_use",
}
# Transform spec for numeric features; presumably 'min' rescales PY relative
# to the column minimum — TODO confirm against pubmlp.preprocess_dataset.
numeric_transform = {'PY': 'min'}
label_col = column_specifications['label_col']
# Fit encoding/scaling transforms on the training split only...
train_dataset, fitted = preprocess_dataset(
    train_df, tokenizer, device, column_specifications, numeric_transform,
    max_length=config.max_length,
)
train_dataloader = create_dataloader(train_dataset, RandomSampler, config.batch_size)
# ...and reuse the fitted transforms on validation/test to avoid leakage.
validation_dataset, _ = preprocess_dataset(
    validation_df, tokenizer, device, column_specifications, numeric_transform,
    max_length=config.max_length, fitted_transforms=fitted,
)
validation_dataloader = create_dataloader(validation_dataset, SequentialSampler, config.eval_batch_size)
test_dataset, _ = preprocess_dataset(
    test_df, tokenizer, device, column_specifications, numeric_transform,
    max_length=config.max_length, fitted_transforms=fitted,
)
test_dataloader = create_dataloader(test_dataset, SequentialSampler, config.eval_batch_size)
Device: cuda, Batch size: 8
config.json: 0%| | 0.00/570 [00:00<?, ?B/s]
tokenizer_config.json: 0%| | 0.00/48.0 [00:00<?, ?B/s]
vocab.txt: 0%| | 0.00/232k [00:00<?, ?B/s]
tokenizer.json: 0%| | 0.00/466k [00:00<?, ?B/s]
Train the model and perform validation.¶
In [10]:
import logging
# Temporarily raise transformers' log level to ERROR so checkpoint-loading
# chatter doesn't flood the cell output while the backbone is instantiated.
logging.getLogger("transformers").setLevel(logging.ERROR)
model = PubMLP(
    categorical_vocab_sizes=fitted.categorical_vocab_sizes,
    numeric_cols_num=1,
    mlp_hidden_size=config.mlp_hidden_size,
    output_size=1,  # single logit for binary classification
    dropout_rate=config.dropout_rate,
    embedding_model=config.embedding_model,
    model_name=config.model_name,
    n_hidden_layers=config.n_hidden_layers,
).to(device)
logging.getLogger("transformers").setLevel(logging.WARNING)
# Single-logit binary target -> BCE-with-logits; AdamW with a small eps.
criterion = nn.BCEWithLogitsLoss()
optimizer = AdamW(model.parameters(), lr=config.learning_rate, eps=1e-8)
# Train with early stopping on validation loss; train_evaluate_model returns
# per-epoch histories plus the best checkpoint state and its epoch index.
(train_losses, validation_losses,
 train_accuracies, validation_accuracies,
 test_accuracy, best_val_loss,
 best_model_state, best_epoch) = train_evaluate_model(
    model, train_dataloader, validation_dataloader, test_dataloader,
    optimizer, criterion, device, config.epochs,
    early_stopping_patience=config.early_stopping_patience,
    gradient_clip_norm=config.gradient_clip_norm,
    pos_weight=None,
    use_warmup=False,
)
# Persist the best checkpoint for the prediction section below.
torch.save(best_model_state, f"best_model_state_{label_col}.pth")
model.safetensors: 0%| | 0.00/440M [00:00<?, ?B/s]
Loading weights: 0%| | 0/199 [00:00<?, ?it/s]
Epoch 1/5: 100%|██████████| 158/158 [00:27<00:00, 5.77it/s, loss=0.549]
Epoch: 0001/0005 | Train Loss: 0.549 | Val Loss: 0.444 *** Best *** Train Acc: 87.816% | Val Acc: 85.443% | 0.62 min
Epoch 2/5: 100%|██████████| 158/158 [00:25<00:00, 6.09it/s, loss=0.437]
Epoch: 0002/0005 | Train Loss: 0.437 | Val Loss: 0.374 *** Best *** Train Acc: 91.851% | Val Acc: 91.772% | 1.22 min
Epoch 3/5: 100%|██████████| 158/158 [00:25<00:00, 6.09it/s, loss=0.392]
Epoch: 0003/0005 | Train Loss: 0.392 | Val Loss: 0.355 *** Best *** Train Acc: 91.851% | Val Acc: 90.506% | 1.82 min
Epoch 4/5: 100%|██████████| 158/158 [00:25<00:00, 6.09it/s, loss=0.353]
Epoch: 0004/0005 | Train Loss: 0.353 | Val Loss: 0.319 *** Best *** Train Acc: 92.959% | Val Acc: 91.772% | 2.42 min
Epoch 5/5: 100%|██████████| 158/158 [00:25<00:00, 6.09it/s, loss=0.332]
Epoch: 0005/0005 | Train Loss: 0.332 | Val Loss: 0.303 *** Best *** Train Acc: 93.196% | Val Acc: 92.405% | 3.02 min Loading best model from epoch 5 Test Accuracy: 95.570% | Best epoch 5 (val loss: 0.303)
Plot shows training losses, validation losses, training accuracies, validation accuracies, test accuracy, and the best validation loss over time.¶
In [11]:
import matplotlib.pyplot as plt

# plot_results() shows the figure itself, which would finalize it before we
# can reposition the legend. Temporarily no-op plt.show while it runs, and
# restore it in a finally block so an exception inside plot_results cannot
# leave pyplot permanently silenced (the original patch was not exception-safe).
_orig_show = plt.show
plt.show = lambda *args, **kwargs: None
try:
    plot_results(train_losses, validation_losses, train_accuracies, validation_accuracies, test_accuracy, best_val_loss, best_epoch=best_epoch)
finally:
    plt.show = _orig_show

# Move the second panel's legend out of the way, then render for real.
fig = plt.gcf()
fig.axes[1].legend(loc='lower right')
plt.show()
Evaluation metrics: classification report, confusion matrix, and ROC curve.¶
In [12]:
# Final held-out evaluation of the restored best model on the test split.
test_predictions, test_probs, test_labels = get_predictions_and_labels(model, test_dataloader, device)
# Writes the classification report, confusion matrix, and ROC curve to files/.
metrics = calculate_evaluation_metrics(
    test_labels, test_predictions, test_probs,
    output_dir='files', label_name=label_col, save_figures=True,
)
# Flag predictions in the default 0.3-0.7 probability band (per the printout)
# as candidates for manual review.
uncertain_flags = flag_uncertain(test_probs)
print(f"\nUncertain predictions (0.3-0.7): {sum(uncertain_flags)} / {len(uncertain_flags)}")
Evaluating: 100%|██████████| 20/20 [00:01<00:00, 18.84it/s]
EVALUATION METRICS: TECHNOLOGY_USE
precision recall f1-score support
Exclude 1.000 0.720 0.837 25
Include 0.950 1.000 0.974 133
accuracy 0.956 158
macro avg 0.975 0.860 0.906 158
weighted avg 0.958 0.956 0.953 158
Key Metrics:
accuracy: 0.956
precision: 0.950
recall: 1.000
specificity: 0.720
f1_score: 0.974
roc_auc: 0.916
Confusion matrix saved: files/confusion_matrix_technology_use.png
ROC curve saved: files/roc_curve_technology_use.png
Uncertain predictions (0.3-0.7): 18 / 158
Cross-validation for reliable performance estimates.¶
In [13]:
import logging

# Silence transformers' checkpoint-loading chatter for the duration of the folds.
logging.getLogger("transformers").setLevel(logging.ERROR)

# Suppress sklearn UserWarnings (e.g. undefined-metric warnings on degenerate
# folds) only while cross-validation runs. catch_warnings() restores the exact
# previous filter state on exit; the original code instead rebuilt
# warnings.filters by dropping EVERY UserWarning entry, clobbering unrelated
# filters installed by other cells (like the huggingface_hub one below).
with warnings.catch_warnings():
    warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')
    cv_results = cross_validate(
        df, tokenizer, device, column_specifications, numeric_transform, config,
        numeric_cols_num=1, output_size=1, output_dir='files/cv',
    )

logging.getLogger("transformers").setLevel(logging.WARNING)

print(f"\nMean F1: {cv_results['mean_metrics']['f1_score']:.3f} ± {cv_results['std_metrics']['f1_score']:.3f}")
print(f"Mean ROC AUC: {cv_results['mean_metrics']['roc_auc']:.3f} ± {cv_results['std_metrics']['roc_auc']:.3f}")
============================================================ Fold 1/5 ============================================================ Train: 1264 | Val: 316
Loading weights: 0%| | 0/199 [00:00<?, ?it/s]
Epoch 1/5: 100%|██████████| 158/158 [00:26<00:00, 6.07it/s, loss=0.265]
Epoch: 0001/0005 | Train Loss: 0.265 | Val Loss: 0.258 *** Best *** Train Acc: 85.997% | Val Acc: 85.443% | 0.63 min
Epoch 2/5: 100%|██████████| 158/158 [00:25<00:00, 6.09it/s, loss=0.243]
Epoch: 0002/0005 | Train Loss: 0.243 | Val Loss: 0.230 *** Best *** Train Acc: 91.772% | Val Acc: 91.456% | 1.27 min
Epoch 3/5: 100%|██████████| 158/158 [00:25<00:00, 6.09it/s, loss=0.226]
Epoch: 0003/0005 | Train Loss: 0.226 | Val Loss: 0.214 *** Best *** Train Acc: 92.722% | Val Acc: 92.089% | 1.90 min
Epoch 4/5: 100%|██████████| 158/158 [00:25<00:00, 6.09it/s, loss=0.21]
Epoch: 0004/0005 | Train Loss: 0.210 | Val Loss: 0.202 *** Best *** Train Acc: 93.038% | Val Acc: 92.722% | 2.53 min
Epoch 5/5: 100%|██████████| 158/158 [00:25<00:00, 6.09it/s, loss=0.2]
Epoch: 0005/0005 | Train Loss: 0.200 | Val Loss: 0.200 *** Best *** Train Acc: 93.513% | Val Acc: 92.722% | 3.16 min Loading best model from epoch 5 Best epoch 5 (val loss: 0.200)
Evaluating: 100%|██████████| 40/40 [00:02<00:00, 19.80it/s]
EVALUATION METRICS: FOLD_1
precision recall f1-score support
Exclude 0.933 0.677 0.785 62
Include 0.926 0.988 0.956 254
accuracy 0.927 316
macro avg 0.930 0.833 0.871 316
weighted avg 0.928 0.927 0.923 316
Key Metrics:
accuracy: 0.927
precision: 0.926
recall: 0.988
specificity: 0.677
f1_score: 0.956
roc_auc: 0.919
Confusion matrix saved: files/cv/fold_1/confusion_matrix_fold_1.png
ROC curve saved: files/cv/fold_1/roc_curve_fold_1.png
Fold 1 — F1: 0.956 | Precision: 0.926 | Recall: 0.988
============================================================
Fold 2/5
============================================================
Train: 1264 | Val: 316
Loading weights: 0%| | 0/199 [00:00<?, ?it/s]
Epoch 1/5: 100%|██████████| 158/158 [00:25<00:00, 6.08it/s, loss=0.258]
Epoch: 0001/0005 | Train Loss: 0.258 | Val Loss: 0.241 *** Best *** Train Acc: 91.772% | Val Acc: 89.241% | 0.63 min
Epoch 2/5: 100%|██████████| 158/158 [00:25<00:00, 6.09it/s, loss=0.225]
Epoch: 0002/0005 | Train Loss: 0.225 | Val Loss: 0.222 *** Best *** Train Acc: 92.880% | Val Acc: 89.873% | 1.26 min
Epoch 3/5: 100%|██████████| 158/158 [00:25<00:00, 6.09it/s, loss=0.213]
Epoch: 0003/0005 | Train Loss: 0.213 | Val Loss: 0.213 *** Best *** Train Acc: 93.671% | Val Acc: 90.190% | 1.90 min
Epoch 4/5: 100%|██████████| 158/158 [00:25<00:00, 6.09it/s, loss=0.202]
Epoch: 0004/0005 | Train Loss: 0.202 | Val Loss: 0.209 *** Best *** Train Acc: 93.908% | Val Acc: 90.190% | 2.53 min
Epoch 5/5: 100%|██████████| 158/158 [00:25<00:00, 6.09it/s, loss=0.199]
Epoch: 0005/0005 | Train Loss: 0.199 | Val Loss: 0.203 *** Best *** Train Acc: 93.987% | Val Acc: 91.139% | 3.16 min Loading best model from epoch 5 Best epoch 5 (val loss: 0.203)
Evaluating: 100%|██████████| 40/40 [00:02<00:00, 19.86it/s]
EVALUATION METRICS: FOLD_2
precision recall f1-score support
Exclude 0.870 0.645 0.741 62
Include 0.919 0.976 0.947 254
accuracy 0.911 316
macro avg 0.894 0.811 0.844 316
weighted avg 0.909 0.911 0.906 316
Key Metrics:
accuracy: 0.911
precision: 0.919
recall: 0.976
specificity: 0.645
f1_score: 0.947
roc_auc: 0.896
Confusion matrix saved: files/cv/fold_2/confusion_matrix_fold_2.png
ROC curve saved: files/cv/fold_2/roc_curve_fold_2.png
Fold 2 — F1: 0.947 | Precision: 0.919 | Recall: 0.976
============================================================
Fold 3/5
============================================================
Train: 1264 | Val: 316
Loading weights: 0%| | 0/199 [00:00<?, ?it/s]
Epoch 1/5: 100%|██████████| 158/158 [00:25<00:00, 6.08it/s, loss=0.274]
Epoch: 0001/0005 | Train Loss: 0.274 | Val Loss: 0.263 *** Best *** Train Acc: 80.538% | Val Acc: 80.696% | 0.63 min
Epoch 2/5: 100%|██████████| 158/158 [00:25<00:00, 6.09it/s, loss=0.256]
Epoch: 0002/0005 | Train Loss: 0.256 | Val Loss: 0.242 *** Best *** Train Acc: 80.538% | Val Acc: 80.696% | 1.27 min
Epoch 3/5: 100%|██████████| 158/158 [00:25<00:00, 6.09it/s, loss=0.242]
Epoch: 0003/0005 | Train Loss: 0.242 | Val Loss: 0.235 *** Best *** Train Acc: 80.538% | Val Acc: 80.696% | 1.90 min
Epoch 4/5: 100%|██████████| 158/158 [00:25<00:00, 6.09it/s, loss=0.235]
Epoch: 0004/0005 | Train Loss: 0.235 | Val Loss: 0.231 *** Best *** Train Acc: 80.538% | Val Acc: 80.696% | 2.53 min
Epoch 5/5: 100%|██████████| 158/158 [00:25<00:00, 6.09it/s, loss=0.227]
Epoch: 0005/0005 | Train Loss: 0.227 | Val Loss: 0.228 *** Best *** Train Acc: 80.538% | Val Acc: 80.696% | 3.16 min Loading best model from epoch 5 Best epoch 5 (val loss: 0.228)
Evaluating: 100%|██████████| 40/40 [00:02<00:00, 19.66it/s]
EVALUATION METRICS: FOLD_3
precision recall f1-score support
Exclude 0.000 0.000 0.000 61
Include 0.807 1.000 0.893 255
accuracy 0.807 316
macro avg 0.403 0.500 0.447 316
weighted avg 0.651 0.807 0.721 316
Key Metrics:
accuracy: 0.807
precision: 0.807
recall: 1.000
specificity: 0.000
f1_score: 0.893
roc_auc: 0.940
Confusion matrix saved: files/cv/fold_3/confusion_matrix_fold_3.png
ROC curve saved: files/cv/fold_3/roc_curve_fold_3.png
Fold 3 — F1: 0.893 | Precision: 0.807 | Recall: 1.000
============================================================
Fold 4/5
============================================================
Train: 1264 | Val: 316
Loading weights: 0%| | 0/199 [00:00<?, ?it/s]
Epoch 1/5: 100%|██████████| 158/158 [00:25<00:00, 6.08it/s, loss=0.259]
Epoch: 0001/0005 | Train Loss: 0.259 | Val Loss: 0.240 *** Best *** Train Acc: 91.535% | Val Acc: 89.873% | 0.63 min
Epoch 2/5: 100%|██████████| 158/158 [00:25<00:00, 6.09it/s, loss=0.234]
Epoch: 0002/0005 | Train Loss: 0.234 | Val Loss: 0.226 *** Best *** Train Acc: 92.880% | Val Acc: 90.506% | 1.27 min
Epoch 3/5: 100%|██████████| 158/158 [00:25<00:00, 6.09it/s, loss=0.22]
Epoch: 0003/0005 | Train Loss: 0.220 | Val Loss: 0.218 *** Best *** Train Acc: 93.513% | Val Acc: 90.190% | 1.90 min
Epoch 4/5: 100%|██████████| 158/158 [00:25<00:00, 6.09it/s, loss=0.212]
Epoch: 0004/0005 | Train Loss: 0.212 | Val Loss: 0.213 *** Best *** Train Acc: 93.671% | Val Acc: 90.506% | 2.53 min
Epoch 5/5: 100%|██████████| 158/158 [00:25<00:00, 6.09it/s, loss=0.206]
Epoch: 0005/0005 | Train Loss: 0.206 | Val Loss: 0.212 *** Best *** Train Acc: 93.908% | Val Acc: 89.873% | 3.16 min Loading best model from epoch 5 Best epoch 5 (val loss: 0.212)
Evaluating: 100%|██████████| 40/40 [00:02<00:00, 19.89it/s]
EVALUATION METRICS: FOLD_4
precision recall f1-score support
Exclude 0.822 0.607 0.698 61
Include 0.911 0.969 0.939 255
accuracy 0.899 316
macro avg 0.867 0.788 0.819 316
weighted avg 0.894 0.899 0.893 316
Key Metrics:
accuracy: 0.899
precision: 0.911
recall: 0.969
specificity: 0.607
f1_score: 0.939
roc_auc: 0.848
Confusion matrix saved: files/cv/fold_4/confusion_matrix_fold_4.png
ROC curve saved: files/cv/fold_4/roc_curve_fold_4.png
Fold 4 — F1: 0.939 | Precision: 0.911 | Recall: 0.969
============================================================
Fold 5/5
============================================================
Train: 1264 | Val: 316
Loading weights: 0%| | 0/199 [00:00<?, ?it/s]
Epoch 1/5: 100%|██████████| 158/158 [00:25<00:00, 6.08it/s, loss=0.261]
Epoch: 0001/0005 | Train Loss: 0.261 | Val Loss: 0.248 *** Best *** Train Acc: 80.538% | Val Acc: 80.696% | 0.63 min
Epoch 2/5: 100%|██████████| 158/158 [00:25<00:00, 6.09it/s, loss=0.248]
Epoch: 0002/0005 | Train Loss: 0.248 | Val Loss: 0.247 *** Best *** Train Acc: 80.538% | Val Acc: 80.696% | 1.26 min
Epoch 3/5: 100%|██████████| 158/158 [00:25<00:00, 6.10it/s, loss=0.237]
Epoch: 0003/0005 | Train Loss: 0.237 | Val Loss: 0.242 *** Best *** Train Acc: 80.538% | Val Acc: 80.696% | 1.89 min
Epoch 4/5: 100%|██████████| 158/158 [00:25<00:00, 6.09it/s, loss=0.225]
Epoch: 0004/0005 | Train Loss: 0.225 | Val Loss: 0.233 *** Best *** Train Acc: 80.538% | Val Acc: 80.696% | 2.53 min
Epoch 5/5: 100%|██████████| 158/158 [00:25<00:00, 6.09it/s, loss=0.214]
Epoch: 0005/0005 | Train Loss: 0.214 | Val Loss: 0.240 Train Acc: 80.538% | Val Acc: 80.696% | 3.16 min Loading best model from epoch 4 Best epoch 4 (val loss: 0.233)
Evaluating: 100%|██████████| 40/40 [00:02<00:00, 19.76it/s]
EVALUATION METRICS: FOLD_5
precision recall f1-score support
Exclude 0.000 0.000 0.000 61
Include 0.807 1.000 0.893 255
accuracy 0.807 316
macro avg 0.403 0.500 0.447 316
weighted avg 0.651 0.807 0.721 316
Key Metrics:
accuracy: 0.807
precision: 0.807
recall: 1.000
specificity: 0.000
f1_score: 0.893
roc_auc: 0.902
Confusion matrix saved: files/cv/fold_5/confusion_matrix_fold_5.png
ROC curve saved: files/cv/fold_5/roc_curve_fold_5.png
Fold 5 — F1: 0.893 | Precision: 0.807 | Recall: 1.000
============================================================
Cross-Validation Summary (5 folds)
============================================================
accuracy: 0.870 ± 0.052
precision: 0.874 ± 0.055
recall: 0.987 ± 0.013
specificity: 0.386 ± 0.316
f1_score: 0.926 ± 0.027
roc_auc: 0.901 ± 0.030
Best fold: 1 (val acc: 92.722%)
Mean F1: 0.926 ± 0.027
Mean ROC AUC: 0.901 ± 0.030
Calibrate model probabilities via temperature scaling.¶
In [14]:
# Fit a probability calibrator on the validation set (temperature scaling,
# judging by the .temperature attribute) so predicted probabilities — and the
# uncertainty band below — are better calibrated.
calibration = calibrate_model(model, validation_dataloader, device)
print(f"Temperature: {calibration.temperature:.3f}")
Temperature: 0.941
Filter rows where "technology_use" is NA (not labeled).¶
In [15]:
# Load the records that passed single-case screening; technology_use is still
# NA for these rows — that is what the model will predict.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning, module='huggingface_hub')
screening_cols = ['UT', 'single_case', 'technology_use', 'SO', 'PY', 'AF', 'TI', 'AB', 'DE']
unlabeled_data = pd.read_excel("files/all_data_single_case.xlsx")
unlabeled_df = unlabeled_data[screening_cols].copy()
# Publication year must be numeric for the fitted transform.
unlabeled_df["PY"] = pd.to_numeric(unlabeled_df["PY"], errors="coerce")
# Report per-column missingness (technology_use should be entirely NA here).
missing_values = unlabeled_df.isnull().sum()
print(missing_values)
UT 0 single_case 0 technology_use 4349 SO 0 PY 0 AF 0 TI 0 AB 129 DE 912 dtype: int64
Load the BERT tokenizer.¶
In [16]:
# Re-instantiate the tokenizer (same model as the training section) so the
# prediction section can be run on its own once training artifacts exist.
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
Preprocess and create dataloaders.¶
In [17]:
# Same input specification as training, restated so this section reads on its own.
column_specifications = {
    "text_cols": ['AF', 'TI', 'AB', 'DE'],
    "categorical_cols": ['SO'],
    "numeric_cols": ['PY'],
    "label_col": "technology_use",
}
numeric_transform = {'PY': 'min'}
label_col = column_specifications['label_col']
# Apply fitted transforms from training (no data leakage)
unlabeled_dataset, _ = preprocess_dataset(
    unlabeled_df, tokenizer, device, column_specifications, numeric_transform,
    max_length=config.max_length, fitted_transforms=fitted,
)
# Sequential sampling keeps predictions aligned with unlabeled_df row order.
unlabeled_dataloader = create_dataloader(unlabeled_dataset, SequentialSampler, config.eval_batch_size)
Load and initialize the PubMLP model.¶
In [18]:
import logging
# Temporarily silence transformers' initialization logging while the backbone loads.
logging.getLogger("transformers").setLevel(logging.ERROR)
# Rebuild the architecture with the same hyperparameters used in training —
# it must match exactly for the saved state dict to load.
model = PubMLP(
    categorical_vocab_sizes=fitted.categorical_vocab_sizes,
    numeric_cols_num=1,
    mlp_hidden_size=config.mlp_hidden_size,
    output_size=1,
    dropout_rate=config.dropout_rate,
    embedding_model=config.embedding_model,
    model_name=config.model_name,
    n_hidden_layers=config.n_hidden_layers,
).to(device)
logging.getLogger("transformers").setLevel(logging.WARNING)
# Restore the best checkpoint saved by the training cell.
model.load_state_dict(torch.load(f"best_model_state_{label_col}.pth", map_location=device))
Loading weights: 0%| | 0/199 [00:00<?, ?it/s]
Out[18]:
<All keys matched successfully>
Predict each publication.¶
In [19]:
# Run calibrated inference over every unlabeled record.
predictions, probabilities = predict_model(model, unlabeled_dataloader, device, calibration=calibration)

# Flag probabilities inside the configured uncertainty band for manual review.
uncertain = flag_uncertain(probabilities, low=config.uncertainty_low, high=config.uncertainty_high)
print(f"Uncertain predictions ({config.uncertainty_low}-{config.uncertainty_high}): {sum(uncertain)} / {len(uncertain)}")

predictions_data = pd.DataFrame({
    label_col: ['Yes' if p == 1 else 'No' for p in predictions],
    'probability': probabilities,
    'uncertain': uncertain,
})

# A length mismatch would mean the dataloader dropped or duplicated rows.
# Fail fast: the original code only printed a message here and then crashed
# later with a NameError on the undefined `predicted_data`.
if len(predictions_data) != len(unlabeled_df):
    raise ValueError("The number of rows in predictions_df and unlabeled_df does not match.")

# Attach predictions positionally (dataloader preserved row order — SequentialSampler).
unlabeled_df[label_col] = predictions_data[label_col].values
unlabeled_df['probability'] = predictions_data['probability'].values
unlabeled_df['uncertain'] = predictions_data['uncertain'].values
predicted_data = unlabeled_df

predicted_file = f"files/predicted_data_{label_col}.xlsx"
predicted_data.to_excel(predicted_file, index=False)
print(f"Data saved to {predicted_file}")

screened_pubs = (predicted_data[label_col] == 'Yes').sum()
print(f"'Yes' predictions for {label_col}: {screened_pubs}")
Predicting: 100%|██████████| 544/544 [00:27<00:00, 19.62it/s]
Uncertain predictions (0.3-0.7): 1855 / 4349 Data saved to files/predicted_data_technology_use.xlsx 'Yes' predictions for technology_use: 2540
In [20]:
# Stack human labels and model predictions, then re-attach them to the master record list.
merged_data = pd.concat([labeled_data, predicted_data], ignore_index=True, sort=False)
init_all_data = pd.read_csv("files/init_all_data.csv")
# Left-merge on UT; with suffixes ("", "_y") the master table keeps its
# column names and duplicated columns from merged_data get "_y", which we drop.
# (The original also stripped an "_x" suffix, but with these suffixes no "_x"
# column can ever be produced — that line was dead code and is removed.)
all_data = pd.merge(init_all_data, merged_data, on="UT", how="left", suffixes=("", "_y"))
all_data = all_data.loc[:, ~all_data.columns.str.endswith("_y")]
all_data_file = f"files/all_data_{label_col}.xlsx"
all_data.to_excel(all_data_file, index=False)
print(f"Data saved to {all_data_file}")
# A study is included only when BOTH screening criteria are 'Yes'.
# Vectorized boolean mask replaces the original row-wise apply(axis=1);
# NaN labels compare as False, matching the lambda's behavior.
included_mask = (all_data['single_case'] == 'Yes') & (all_data['technology_use'] == 'Yes')
all_data['included'] = included_mask.map({True: 'Yes', False: 'No'})
included_pubs = (all_data['included'] == 'Yes').sum()
print(f"Included studies (single_case=Yes AND technology_use=Yes): {included_pubs}")
Data saved to files/all_data_technology_use.xlsx Included studies (single_case=Yes AND technology_use=Yes): 3677
In [21]:
# Export this notebook to a standalone index.html.
from nbconvert import HTMLExporter
import nbformat

notebook_path = 'index.ipynb'
widget_key = 'application/vnd.jupyter.widget-state+json'

with open(notebook_path, 'r', encoding='utf-8') as nb_file:
    notebook = nbformat.reads(nb_file.read(), as_version=4)

# Some widget outputs lack a 'state' entry in their metadata; patch in an
# empty one before export — presumably this avoids an nbconvert failure on
# Colab-produced notebooks (the original guarded the same way).
if 'widgets' in notebook.metadata and widget_key in notebook.metadata['widgets']:
    widget_meta = notebook.metadata['widgets'][widget_key]
    if 'state' not in widget_meta:
        widget_meta['state'] = {}

html_exporter = HTMLExporter()
html_output, _ = html_exporter.from_notebook_node(notebook)
with open('index.html', 'w', encoding='utf-8') as html_file:
    html_file.write(html_output)