Publication Screening for Technology Use¶


In [1]:
from datetime import datetime

# Timestamp this notebook run, e.g. "February 28, 2026".
formatted_date = format(datetime.now(), "%B %d, %Y")
print(formatted_date)
February 28, 2026

0. Configuring the System Environment¶

In [2]:
# Inspect the attached GPU (model, VRAM, current utilization) before training.
!nvidia-smi
Sat Feb 28 17:22:43 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.82.07              Driver Version: 580.82.07      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   33C    P0             47W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+
In [3]:
# Inspect total/available system RAM for the runtime.
!free -h
               total        used        free      shared  buff/cache   available
Mem:            83Gi       944Mi        78Gi       3.0Mi       3.7Gi        81Gi
Swap:             0B          0B          0B

1. Setting Up the Computing Environment¶

Install and load Python libraries.¶
In [4]:
# Mount Google Drive so input files and model checkpoints persist across sessions.
from google.colab import drive
drive.mount('/content/drive')

# NOTE(review): the returned token is discarded — presumably this call only
# triggers the Colab secrets access prompt for HF_TOKEN; confirm it is needed.
from google.colab import userdata
userdata.get('HF_TOKEN')

# Update this path to your project directory in Google Drive
%cd /content/drive/My\ Drive/Colab\ Notebooks/LLM/sped_biblio/screening_technology_use
Mounted at /content/drive
/content/drive/My Drive/Colab Notebooks/LLM/sped_biblio/screening_technology_use
In [5]:
# Optional alternative to the pip install below: recreate the environment
# from the project's environment file (slower, but reproduces its versions).
# !pip install -q condacolab
# !conda env create -f python-venv-environment.yml
In [6]:
!pip install -q pubmlp
In [7]:
import warnings

# Silence warnings emitted from jupyter_client internals while keeping every
# other warning visible; delegate everything else to the original handler.
_orig_showwarning = warnings.showwarning

def _filtered_showwarning(msg, cat, fn, ln, file=None, line=None):
    """Forward to the saved handler unless the warning originates in jupyter_client."""
    if 'jupyter_client' in str(fn):
        return
    _orig_showwarning(msg, cat, fn, ln, file, line)

warnings.showwarning = _filtered_showwarning

import pandas as pd
import torch
from torch.optim import AdamW
import torch.nn as nn
from torch.utils.data.sampler import RandomSampler, SequentialSampler
from transformers import AutoTokenizer

from pubmlp import (
    Config, PubMLP,
    split_data, preprocess_dataset, create_dataloader,
    train_evaluate_model, predict_model,
    get_predictions_and_labels, flag_uncertain,
    calculate_evaluation_metrics, plot_results,
    cross_validate, calibrate_model,
    get_device,
)

2. Screening Studies for Technology Use¶

Prepare data for training, validation, and testing.¶
In [8]:
warnings.filterwarnings('ignore', category=FutureWarning)

# Load the hand-labeled screening data and keep only the modeling columns.
labeled_data = pd.read_excel("files/labeled_data.xlsx")

keep_cols = ['UT', 'single_case', 'technology_use', 'SO', 'PY', 'AF', 'TI', 'AB', 'DE']
labeled_df = labeled_data[keep_cols].copy()

# Report per-column missingness before cleaning.
missing_values = labeled_df.isnull().sum()
print(missing_values)

# Coerce publication year to numeric, drop rows without a year, and
# binarize the Yes/No labels to 1/0.
df = labeled_df.copy()
df["PY"] = pd.to_numeric(df["PY"], errors='coerce')
df = df.dropna(subset=["PY"])
for label in ("single_case", "technology_use"):
    df[label] = df[label].map({'Yes': 1, 'No': 0})

train_df, validation_df, test_df = split_data(df, random_state=42)

print(f"\nTraining: {len(train_df)}, Validation: {len(validation_df)}, Test: {len(test_df)}")
UT                  0
single_case         0
technology_use      0
SO                  0
PY                  0
AF                  0
TI                  0
AB                  0
DE                269
dtype: int64

Training: 1264, Validation: 158, Test: 158
Preprocess data and convert to dataloaders.¶
In [9]:
warnings.filterwarnings('ignore', category=FutureWarning)

# Hyperparameters for the BERT-embedding + MLP classifier.
config = Config(
    random_seed=2025,
    embedding_model='bert',
    batch_size=8,
    eval_batch_size=8,
    epochs=5,
    learning_rate=1e-5,
    mlp_hidden_size=16,
    dropout_rate=0.2,
    early_stopping_patience=3,
    n_hidden_layers=2,
)
# Seed the RNGs for reproducibility (implementation lives in pubmlp).
config.set_random_seeds()

device = get_device()
print(f"Device: {device}, Batch size: {config.batch_size}")

tokenizer = AutoTokenizer.from_pretrained(config.model_name)

# Which dataframe columns feed the model and how: free text is tokenized,
# journal (SO) is a categorical embedding, year (PY) is numeric.
column_specifications = {
    "text_cols": ['AF', 'TI', 'AB', 'DE'],
    "categorical_cols": ['SO'],
    "numeric_cols": ['PY'],
    "label_col": "technology_use",
}

numeric_transform = {'PY': 'min'}
label_col = column_specifications['label_col']

# Fit transforms (vocabularies, numeric scaling) on the training split only...
train_dataset, fitted = preprocess_dataset(
    train_df, tokenizer, device, column_specifications, numeric_transform,
    max_length=config.max_length,
)
train_dataloader = create_dataloader(train_dataset, RandomSampler, config.batch_size)

# ...and re-apply the fitted transforms to validation/test to avoid leakage.
validation_dataset, _ = preprocess_dataset(
    validation_df, tokenizer, device, column_specifications, numeric_transform,
    max_length=config.max_length, fitted_transforms=fitted,
)
validation_dataloader = create_dataloader(validation_dataset, SequentialSampler, config.eval_batch_size)

test_dataset, _ = preprocess_dataset(
    test_df, tokenizer, device, column_specifications, numeric_transform,
    max_length=config.max_length, fitted_transforms=fitted,
)
test_dataloader = create_dataloader(test_dataset, SequentialSampler, config.eval_batch_size)
Device: cuda, Batch size: 8
config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]
tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]
vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]
tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]
Train the model and perform validation.¶
In [10]:
import logging
# Suppress transformers' "some weights not initialized" messages during model build.
logging.getLogger("transformers").setLevel(logging.ERROR)

# Binary classifier: BERT text embeddings + categorical/numeric features
# fed through a small MLP head (output_size=1 logit).
model = PubMLP(
    categorical_vocab_sizes=fitted.categorical_vocab_sizes,
    numeric_cols_num=1,
    mlp_hidden_size=config.mlp_hidden_size,
    output_size=1,
    dropout_rate=config.dropout_rate,
    embedding_model=config.embedding_model,
    model_name=config.model_name,
    n_hidden_layers=config.n_hidden_layers,
).to(device)

logging.getLogger("transformers").setLevel(logging.WARNING)

# BCEWithLogitsLoss pairs with the single-logit output (no sigmoid in the model).
criterion = nn.BCEWithLogitsLoss()
optimizer = AdamW(model.parameters(), lr=config.learning_rate, eps=1e-8)

# Train with early stopping on validation loss; returns per-epoch curves and
# the best-epoch model state for checkpointing.
(train_losses, validation_losses,
 train_accuracies, validation_accuracies,
 test_accuracy, best_val_loss,
 best_model_state, best_epoch) = train_evaluate_model(
    model, train_dataloader, validation_dataloader, test_dataloader,
    optimizer, criterion, device, config.epochs,
    early_stopping_patience=config.early_stopping_patience,
    gradient_clip_norm=config.gradient_clip_norm,
    pos_weight=None,
    use_warmup=False,
)

# Persist the best checkpoint for the inference section below.
torch.save(best_model_state, f"best_model_state_{label_col}.pth")
model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]
Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]
Epoch 1/5: 100%|██████████| 158/158 [00:27<00:00,  5.77it/s, loss=0.549]
Epoch: 0001/0005 | Train Loss: 0.549 | Val Loss: 0.444 *** Best ***
Train Acc: 87.816% | Val Acc: 85.443% | 0.62 min
Epoch 2/5: 100%|██████████| 158/158 [00:25<00:00,  6.09it/s, loss=0.437]
Epoch: 0002/0005 | Train Loss: 0.437 | Val Loss: 0.374 *** Best ***
Train Acc: 91.851% | Val Acc: 91.772% | 1.22 min
Epoch 3/5: 100%|██████████| 158/158 [00:25<00:00,  6.09it/s, loss=0.392]
Epoch: 0003/0005 | Train Loss: 0.392 | Val Loss: 0.355 *** Best ***
Train Acc: 91.851% | Val Acc: 90.506% | 1.82 min
Epoch 4/5: 100%|██████████| 158/158 [00:25<00:00,  6.09it/s, loss=0.353]
Epoch: 0004/0005 | Train Loss: 0.353 | Val Loss: 0.319 *** Best ***
Train Acc: 92.959% | Val Acc: 91.772% | 2.42 min
Epoch 5/5: 100%|██████████| 158/158 [00:25<00:00,  6.09it/s, loss=0.332]
Epoch: 0005/0005 | Train Loss: 0.332 | Val Loss: 0.303 *** Best ***
Train Acc: 93.196% | Val Acc: 92.405% | 3.02 min

Loading best model from epoch 5
Test Accuracy: 95.570% | Best epoch 5 (val loss: 0.303)
Plot shows training losses, validation losses, training accuracies, validation accuracies, test accuracy, and the best validation loss over time.¶
In [11]:
import matplotlib.pyplot as plt

# plot_results() calls plt.show() internally; temporarily no-op it so the
# legend can be adjusted before rendering. Restore inside `finally` so an
# exception in plot_results cannot leave plt.show permanently disabled.
_show = plt.show
plt.show = lambda *a, **k: None
try:
    plot_results(train_losses, validation_losses, train_accuracies, validation_accuracies, test_accuracy, best_val_loss, best_epoch=best_epoch)
finally:
    plt.show = _show
fig = plt.gcf()
fig.axes[1].legend(loc='lower right')
plt.show()
No description has been provided for this image
Evaluation metrics: classification report, confusion matrix, and ROC curve.¶
In [12]:
# Score the held-out test split with the trained model.
test_predictions, test_probs, test_labels = get_predictions_and_labels(model, test_dataloader, device)

# Classification report, confusion matrix, and ROC curve (figures saved to files/).
metrics = calculate_evaluation_metrics(
    test_labels,
    test_predictions,
    test_probs,
    output_dir='files',
    label_name=label_col,
    save_figures=True,
)

# Flag borderline probabilities for manual review.
uncertain_flags = flag_uncertain(test_probs)
print(f"\nUncertain predictions (0.3-0.7): {sum(uncertain_flags)} / {len(uncertain_flags)}")
Evaluating: 100%|██████████| 20/20 [00:01<00:00, 18.84it/s]
EVALUATION METRICS: TECHNOLOGY_USE
              precision    recall  f1-score   support

     Exclude      1.000     0.720     0.837        25
     Include      0.950     1.000     0.974       133

    accuracy                          0.956       158
   macro avg      0.975     0.860     0.906       158
weighted avg      0.958     0.956     0.953       158

Key Metrics:
  accuracy: 0.956
  precision: 0.950
  recall: 1.000
  specificity: 0.720
  f1_score: 0.974
  roc_auc: 0.916

Confusion matrix saved: files/confusion_matrix_technology_use.png
ROC curve saved: files/roc_curve_technology_use.png

Uncertain predictions (0.3-0.7): 18 / 158
Cross-validation for reliable performance estimates.¶
In [13]:
import logging

# Silence noisy transformers logs and sklearn UserWarnings only for the
# duration of cross-validation. Using warnings.catch_warnings() restores the
# exact prior filter state — the old list-comprehension cleanup stripped
# EVERY UserWarning filter, including ones added by other cells. The logging
# level is restored in `finally` so a failed fold cannot leave it at ERROR.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)
try:
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')
        cv_results = cross_validate(
            df, tokenizer, device, column_specifications, numeric_transform, config,
            numeric_cols_num=1, output_size=1, output_dir='files/cv',
        )
finally:
    transformers_logger.setLevel(logging.WARNING)

print(f"\nMean F1: {cv_results['mean_metrics']['f1_score']:.3f} ± {cv_results['std_metrics']['f1_score']:.3f}")
print(f"Mean ROC AUC: {cv_results['mean_metrics']['roc_auc']:.3f} ± {cv_results['std_metrics']['roc_auc']:.3f}")
============================================================
Fold 1/5
============================================================
Train: 1264 | Val: 316
Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]
Epoch 1/5: 100%|██████████| 158/158 [00:26<00:00,  6.07it/s, loss=0.265]
Epoch: 0001/0005 | Train Loss: 0.265 | Val Loss: 0.258 *** Best ***
Train Acc: 85.997% | Val Acc: 85.443% | 0.63 min
Epoch 2/5: 100%|██████████| 158/158 [00:25<00:00,  6.09it/s, loss=0.243]
Epoch: 0002/0005 | Train Loss: 0.243 | Val Loss: 0.230 *** Best ***
Train Acc: 91.772% | Val Acc: 91.456% | 1.27 min
Epoch 3/5: 100%|██████████| 158/158 [00:25<00:00,  6.09it/s, loss=0.226]
Epoch: 0003/0005 | Train Loss: 0.226 | Val Loss: 0.214 *** Best ***
Train Acc: 92.722% | Val Acc: 92.089% | 1.90 min
Epoch 4/5: 100%|██████████| 158/158 [00:25<00:00,  6.09it/s, loss=0.21]
Epoch: 0004/0005 | Train Loss: 0.210 | Val Loss: 0.202 *** Best ***
Train Acc: 93.038% | Val Acc: 92.722% | 2.53 min
Epoch 5/5: 100%|██████████| 158/158 [00:25<00:00,  6.09it/s, loss=0.2]
Epoch: 0005/0005 | Train Loss: 0.200 | Val Loss: 0.200 *** Best ***
Train Acc: 93.513% | Val Acc: 92.722% | 3.16 min

Loading best model from epoch 5
Best epoch 5 (val loss: 0.200)
Evaluating: 100%|██████████| 40/40 [00:02<00:00, 19.80it/s]
EVALUATION METRICS: FOLD_1
              precision    recall  f1-score   support

     Exclude      0.933     0.677     0.785        62
     Include      0.926     0.988     0.956       254

    accuracy                          0.927       316
   macro avg      0.930     0.833     0.871       316
weighted avg      0.928     0.927     0.923       316

Key Metrics:
  accuracy: 0.927
  precision: 0.926
  recall: 0.988
  specificity: 0.677
  f1_score: 0.956
  roc_auc: 0.919

Confusion matrix saved: files/cv/fold_1/confusion_matrix_fold_1.png
ROC curve saved: files/cv/fold_1/roc_curve_fold_1.png
Fold 1 — F1: 0.956 | Precision: 0.926 | Recall: 0.988

============================================================
Fold 2/5
============================================================
Train: 1264 | Val: 316
Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]
Epoch 1/5: 100%|██████████| 158/158 [00:25<00:00,  6.08it/s, loss=0.258]
Epoch: 0001/0005 | Train Loss: 0.258 | Val Loss: 0.241 *** Best ***
Train Acc: 91.772% | Val Acc: 89.241% | 0.63 min
Epoch 2/5: 100%|██████████| 158/158 [00:25<00:00,  6.09it/s, loss=0.225]
Epoch: 0002/0005 | Train Loss: 0.225 | Val Loss: 0.222 *** Best ***
Train Acc: 92.880% | Val Acc: 89.873% | 1.26 min
Epoch 3/5: 100%|██████████| 158/158 [00:25<00:00,  6.09it/s, loss=0.213]
Epoch: 0003/0005 | Train Loss: 0.213 | Val Loss: 0.213 *** Best ***
Train Acc: 93.671% | Val Acc: 90.190% | 1.90 min
Epoch 4/5: 100%|██████████| 158/158 [00:25<00:00,  6.09it/s, loss=0.202]
Epoch: 0004/0005 | Train Loss: 0.202 | Val Loss: 0.209 *** Best ***
Train Acc: 93.908% | Val Acc: 90.190% | 2.53 min
Epoch 5/5: 100%|██████████| 158/158 [00:25<00:00,  6.09it/s, loss=0.199]
Epoch: 0005/0005 | Train Loss: 0.199 | Val Loss: 0.203 *** Best ***
Train Acc: 93.987% | Val Acc: 91.139% | 3.16 min

Loading best model from epoch 5
Best epoch 5 (val loss: 0.203)
Evaluating: 100%|██████████| 40/40 [00:02<00:00, 19.86it/s]
EVALUATION METRICS: FOLD_2
              precision    recall  f1-score   support

     Exclude      0.870     0.645     0.741        62
     Include      0.919     0.976     0.947       254

    accuracy                          0.911       316
   macro avg      0.894     0.811     0.844       316
weighted avg      0.909     0.911     0.906       316

Key Metrics:
  accuracy: 0.911
  precision: 0.919
  recall: 0.976
  specificity: 0.645
  f1_score: 0.947
  roc_auc: 0.896

Confusion matrix saved: files/cv/fold_2/confusion_matrix_fold_2.png
ROC curve saved: files/cv/fold_2/roc_curve_fold_2.png
Fold 2 — F1: 0.947 | Precision: 0.919 | Recall: 0.976

============================================================
Fold 3/5
============================================================
Train: 1264 | Val: 316
Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]
Epoch 1/5: 100%|██████████| 158/158 [00:25<00:00,  6.08it/s, loss=0.274]
Epoch: 0001/0005 | Train Loss: 0.274 | Val Loss: 0.263 *** Best ***
Train Acc: 80.538% | Val Acc: 80.696% | 0.63 min
Epoch 2/5: 100%|██████████| 158/158 [00:25<00:00,  6.09it/s, loss=0.256]
Epoch: 0002/0005 | Train Loss: 0.256 | Val Loss: 0.242 *** Best ***
Train Acc: 80.538% | Val Acc: 80.696% | 1.27 min
Epoch 3/5: 100%|██████████| 158/158 [00:25<00:00,  6.09it/s, loss=0.242]
Epoch: 0003/0005 | Train Loss: 0.242 | Val Loss: 0.235 *** Best ***
Train Acc: 80.538% | Val Acc: 80.696% | 1.90 min
Epoch 4/5: 100%|██████████| 158/158 [00:25<00:00,  6.09it/s, loss=0.235]
Epoch: 0004/0005 | Train Loss: 0.235 | Val Loss: 0.231 *** Best ***
Train Acc: 80.538% | Val Acc: 80.696% | 2.53 min
Epoch 5/5: 100%|██████████| 158/158 [00:25<00:00,  6.09it/s, loss=0.227]
Epoch: 0005/0005 | Train Loss: 0.227 | Val Loss: 0.228 *** Best ***
Train Acc: 80.538% | Val Acc: 80.696% | 3.16 min

Loading best model from epoch 5
Best epoch 5 (val loss: 0.228)
Evaluating: 100%|██████████| 40/40 [00:02<00:00, 19.66it/s]
EVALUATION METRICS: FOLD_3
              precision    recall  f1-score   support

     Exclude      0.000     0.000     0.000        61
     Include      0.807     1.000     0.893       255

    accuracy                          0.807       316
   macro avg      0.403     0.500     0.447       316
weighted avg      0.651     0.807     0.721       316

Key Metrics:
  accuracy: 0.807
  precision: 0.807
  recall: 1.000
  specificity: 0.000
  f1_score: 0.893
  roc_auc: 0.940

Confusion matrix saved: files/cv/fold_3/confusion_matrix_fold_3.png
ROC curve saved: files/cv/fold_3/roc_curve_fold_3.png
Fold 3 — F1: 0.893 | Precision: 0.807 | Recall: 1.000

============================================================
Fold 4/5
============================================================
Train: 1264 | Val: 316
Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]
Epoch 1/5: 100%|██████████| 158/158 [00:25<00:00,  6.08it/s, loss=0.259]
Epoch: 0001/0005 | Train Loss: 0.259 | Val Loss: 0.240 *** Best ***
Train Acc: 91.535% | Val Acc: 89.873% | 0.63 min
Epoch 2/5: 100%|██████████| 158/158 [00:25<00:00,  6.09it/s, loss=0.234]
Epoch: 0002/0005 | Train Loss: 0.234 | Val Loss: 0.226 *** Best ***
Train Acc: 92.880% | Val Acc: 90.506% | 1.27 min
Epoch 3/5: 100%|██████████| 158/158 [00:25<00:00,  6.09it/s, loss=0.22]
Epoch: 0003/0005 | Train Loss: 0.220 | Val Loss: 0.218 *** Best ***
Train Acc: 93.513% | Val Acc: 90.190% | 1.90 min
Epoch 4/5: 100%|██████████| 158/158 [00:25<00:00,  6.09it/s, loss=0.212]
Epoch: 0004/0005 | Train Loss: 0.212 | Val Loss: 0.213 *** Best ***
Train Acc: 93.671% | Val Acc: 90.506% | 2.53 min
Epoch 5/5: 100%|██████████| 158/158 [00:25<00:00,  6.09it/s, loss=0.206]
Epoch: 0005/0005 | Train Loss: 0.206 | Val Loss: 0.212 *** Best ***
Train Acc: 93.908% | Val Acc: 89.873% | 3.16 min

Loading best model from epoch 5
Best epoch 5 (val loss: 0.212)
Evaluating: 100%|██████████| 40/40 [00:02<00:00, 19.89it/s]
EVALUATION METRICS: FOLD_4
              precision    recall  f1-score   support

     Exclude      0.822     0.607     0.698        61
     Include      0.911     0.969     0.939       255

    accuracy                          0.899       316
   macro avg      0.867     0.788     0.819       316
weighted avg      0.894     0.899     0.893       316

Key Metrics:
  accuracy: 0.899
  precision: 0.911
  recall: 0.969
  specificity: 0.607
  f1_score: 0.939
  roc_auc: 0.848

Confusion matrix saved: files/cv/fold_4/confusion_matrix_fold_4.png
ROC curve saved: files/cv/fold_4/roc_curve_fold_4.png
Fold 4 — F1: 0.939 | Precision: 0.911 | Recall: 0.969

============================================================
Fold 5/5
============================================================
Train: 1264 | Val: 316
Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]
Epoch 1/5: 100%|██████████| 158/158 [00:25<00:00,  6.08it/s, loss=0.261]
Epoch: 0001/0005 | Train Loss: 0.261 | Val Loss: 0.248 *** Best ***
Train Acc: 80.538% | Val Acc: 80.696% | 0.63 min
Epoch 2/5: 100%|██████████| 158/158 [00:25<00:00,  6.09it/s, loss=0.248]
Epoch: 0002/0005 | Train Loss: 0.248 | Val Loss: 0.247 *** Best ***
Train Acc: 80.538% | Val Acc: 80.696% | 1.26 min
Epoch 3/5: 100%|██████████| 158/158 [00:25<00:00,  6.10it/s, loss=0.237]
Epoch: 0003/0005 | Train Loss: 0.237 | Val Loss: 0.242 *** Best ***
Train Acc: 80.538% | Val Acc: 80.696% | 1.89 min
Epoch 4/5: 100%|██████████| 158/158 [00:25<00:00,  6.09it/s, loss=0.225]
Epoch: 0004/0005 | Train Loss: 0.225 | Val Loss: 0.233 *** Best ***
Train Acc: 80.538% | Val Acc: 80.696% | 2.53 min
Epoch 5/5: 100%|██████████| 158/158 [00:25<00:00,  6.09it/s, loss=0.214]
Epoch: 0005/0005 | Train Loss: 0.214 | Val Loss: 0.240
Train Acc: 80.538% | Val Acc: 80.696% | 3.16 min

Loading best model from epoch 4
Best epoch 4 (val loss: 0.233)
Evaluating: 100%|██████████| 40/40 [00:02<00:00, 19.76it/s]
EVALUATION METRICS: FOLD_5
              precision    recall  f1-score   support

     Exclude      0.000     0.000     0.000        61
     Include      0.807     1.000     0.893       255

    accuracy                          0.807       316
   macro avg      0.403     0.500     0.447       316
weighted avg      0.651     0.807     0.721       316

Key Metrics:
  accuracy: 0.807
  precision: 0.807
  recall: 1.000
  specificity: 0.000
  f1_score: 0.893
  roc_auc: 0.902

Confusion matrix saved: files/cv/fold_5/confusion_matrix_fold_5.png
ROC curve saved: files/cv/fold_5/roc_curve_fold_5.png
Fold 5 — F1: 0.893 | Precision: 0.807 | Recall: 1.000

============================================================
Cross-Validation Summary (5 folds)
============================================================
accuracy: 0.870 ± 0.052
precision: 0.874 ± 0.055
recall: 0.987 ± 0.013
specificity: 0.386 ± 0.316
f1_score: 0.926 ± 0.027
roc_auc: 0.901 ± 0.030
Best fold: 1 (val acc: 92.722%)

Mean F1: 0.926 ± 0.027
Mean ROC AUC: 0.901 ± 0.030
Calibrate model probabilities via temperature scaling.¶
In [14]:
# Calibrate predicted probabilities on the validation set; the fitted
# temperature is applied at prediction time below.
calibration = calibrate_model(model, validation_dataloader, device)
print(f"Temperature: {calibration.temperature:.3f}")
Temperature: 0.941
Filter rows where "technology_use" is NA (not labeled).¶
In [15]:
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning, module='huggingface_hub')

# Load the pool of publications that still need a technology_use label.
unlabeled_data = pd.read_excel("files/all_data_single_case.xlsx")

screen_cols = ['UT', 'single_case', 'technology_use', 'SO', 'PY', 'AF', 'TI', 'AB', 'DE']
unlabeled_df = unlabeled_data[screen_cols].copy()
unlabeled_df["PY"] = pd.to_numeric(unlabeled_df["PY"], errors="coerce")

# technology_use should be entirely NA here — these are the rows to predict.
missing_values = unlabeled_df.isnull().sum()
print(missing_values)
UT                   0
single_case          0
technology_use    4349
SO                   0
PY                   0
AF                   0
TI                   0
AB                 129
DE                 912
dtype: int64
Load the BERT tokenizer.¶
In [16]:
# NOTE(review): re-loads the same tokenizer as the training section —
# redundant on a full run; presumably kept for partial re-runs. Confirm.
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
Preprocess and create dataloaders.¶
In [17]:
# NOTE(review): duplicates the column_specifications / numeric_transform
# definitions from the training section — keep the two in sync.
column_specifications = {
    "text_cols": ['AF', 'TI', 'AB', 'DE'],
    "categorical_cols": ['SO'],
    "numeric_cols": ['PY'],
    "label_col": "technology_use",
}

numeric_transform = {'PY': 'min'}
label_col = column_specifications['label_col']

# Apply fitted transforms from training (no data leakage)
unlabeled_dataset, _ = preprocess_dataset(
    unlabeled_df, tokenizer, device, column_specifications, numeric_transform,
    max_length=config.max_length, fitted_transforms=fitted,
)
unlabeled_dataloader = create_dataloader(unlabeled_dataset, SequentialSampler, config.eval_batch_size)
Load and initialize the PubMLP model.¶
In [18]:
import logging
logging.getLogger("transformers").setLevel(logging.ERROR)

# Rebuild the architecture with the same hyperparameters used in training;
# the random weights are overwritten from the checkpoint below.
model = PubMLP(
    categorical_vocab_sizes=fitted.categorical_vocab_sizes,
    numeric_cols_num=1,
    mlp_hidden_size=config.mlp_hidden_size,
    output_size=1,
    dropout_rate=config.dropout_rate,
    embedding_model=config.embedding_model,
    model_name=config.model_name,
    n_hidden_layers=config.n_hidden_layers,
).to(device)

logging.getLogger("transformers").setLevel(logging.WARNING)

# weights_only=True restricts unpickling to tensors/containers — safer and
# sufficient for a state_dict checkpoint (available since torch 1.13).
model.load_state_dict(torch.load(f"best_model_state_{label_col}.pth", map_location=device, weights_only=True))
Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]
Out[18]:
<All keys matched successfully>
Predict each publication.¶
In [19]:
# Predict technology_use for every unlabeled publication, applying the
# temperature calibration fitted on the validation set.
predictions, probabilities = predict_model(model, unlabeled_dataloader, device, calibration=calibration)

uncertain = flag_uncertain(probabilities, low=config.uncertainty_low, high=config.uncertainty_high)
print(f"Uncertain predictions ({config.uncertainty_low}-{config.uncertainty_high}): {sum(uncertain)} / {len(uncertain)}")

predictions_data = pd.DataFrame({
    label_col: ['Yes' if p == 1 else 'No' for p in predictions],
    'probability': probabilities,
    'uncertain': uncertain,
})

# Fail fast on a row-count mismatch: the previous version only printed a
# message, leaving `predicted_data` undefined and crashing below anyway.
if len(predictions_data) != len(unlabeled_df):
    raise ValueError(
        f"Row count mismatch: {len(predictions_data)} predictions "
        f"vs {len(unlabeled_df)} unlabeled rows."
    )

# Write predictions back onto the unlabeled frame positionally (.values
# bypasses index alignment; both frames are in dataloader order).
unlabeled_df[label_col] = predictions_data[label_col].values
unlabeled_df['probability'] = predictions_data['probability'].values
unlabeled_df['uncertain'] = predictions_data['uncertain'].values
predicted_data = unlabeled_df

predicted_file = f"files/predicted_data_{label_col}.xlsx"
predicted_data.to_excel(predicted_file, index=False)
print(f"Data saved to {predicted_file}")

screened_pubs = (predicted_data[label_col] == 'Yes').sum()
print(f"'Yes' predictions for {label_col}: {screened_pubs}")
Predicting: 100%|██████████| 544/544 [00:27<00:00, 19.62it/s]
Uncertain predictions (0.3-0.7): 1855 / 4349
Data saved to files/predicted_data_technology_use.xlsx
'Yes' predictions for technology_use: 2540
In [20]:
# Combine hand-labeled rows with model predictions, then merge onto the
# master bibliography by accession number (UT).
merged_data = pd.concat([labeled_data, predicted_data], ignore_index=True, sort=False)

init_all_data = pd.read_csv("files/init_all_data.csv")

# Left-merge keeps every master row; drop duplicated right-hand ("_y")
# columns and strip any "_x" suffix from the surviving ones.
all_data = pd.merge(init_all_data, merged_data, on="UT", how="left", suffixes=("", "_y"))
all_data = all_data.loc[:, ~all_data.columns.str.endswith("_y")]
all_data.columns = all_data.columns.str.replace("_x$", "", regex=True)

all_data_file = f"files/all_data_{label_col}.xlsx"
all_data.to_excel(all_data_file, index=False)
print(f"Data saved to {all_data_file}")

# Vectorized AND of the two screening criteria (replaces a row-wise apply,
# which made one Python call per row). NaN compares unequal to 'Yes', so
# unlabeled rows map to 'No' exactly as before.
included_mask = (all_data['single_case'] == 'Yes') & (all_data['technology_use'] == 'Yes')
all_data['included'] = included_mask.map({True: 'Yes', False: 'No'})
included_pubs = included_mask.sum()
print(f"Included studies (single_case=Yes AND technology_use=Yes): {included_pubs}")
Data saved to files/all_data_technology_use.xlsx
Included studies (single_case=Yes AND technology_use=Yes): 3677
In [21]:
from nbconvert import HTMLExporter
import nbformat

# Export this notebook to a standalone HTML page.
notebook_path = 'index.ipynb'
html_exporter = HTMLExporter()

with open(notebook_path, 'r', encoding='utf-8') as nb_file:
    notebook = nbformat.reads(nb_file.read(), as_version=4)

# nbconvert fails when widget-state metadata lacks a 'state' key; patch in
# an empty one if it is missing.
widget_state = notebook.metadata.get('widgets', {}).get('application/vnd.jupyter.widget-state+json')
if widget_state is not None and 'state' not in widget_state:
    widget_state['state'] = {}

html_output, _ = html_exporter.from_notebook_node(notebook)

with open('index.html', 'w', encoding='utf-8') as html_file:
    html_file.write(html_output)