Screening Workflow#
End-to-end example of using pubmlp to screen bibliometric records for a systematic review. Two approaches are shown:
Approach A — Sequential single-label: Train one model per screening criterion. Best when criteria have very different class distributions or when you need separate tuning per label.
Approach B — Simultaneous multi-label: Train one model that predicts all criteria at once. Best when criteria share signal from the same text, reducing total training time.
Both approaches use multiple categorical and numeric columns.
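All later snippets share the setup sketched below. This is a minimal sketch, not part of pubmlp's documented API: the pubmlp import list mirrors the names called on this page (the Active Learning section imports from the top-level package the same way), while the config values, the encoder checkpoint, and the numeric_transform value are placeholders to adjust for your install and corpus.
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import RandomSampler, SequentialSampler
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from types import SimpleNamespace
# Assumed top-level exports, matching the calls used on this page
from pubmlp import (
    PubMLP, preprocess_dataset, create_dataloader, train_evaluate_model,
    get_predictions_and_labels, calculate_evaluation_metrics,
    predict_model, plot_results, flag_uncertain,
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Placeholder hyperparameters; tune for your corpus
config = SimpleNamespace(
    embedding_model="bert-base-uncased",  # placeholder encoder checkpoint
    model_name="screening-mlp",           # placeholder run name
    batch_size=16, eval_batch_size=64,
    mlp_hidden_size=256, dropout_rate=0.2,
    learning_rate=2e-5, epochs=20, early_stopping_patience=3,
)
tokenizer = AutoTokenizer.from_pretrained(config.embedding_model)
# Placeholder: whatever numeric-transform spec preprocess_dataset expects
numeric_transform = "standardize"
# Labeled records, numeric columns coerced, then a 70/15/15 split
df = pd.read_excel("labeled_data.xlsx")
df["PY"] = pd.to_numeric(df["PY"], errors="coerce")
df["TC"] = pd.to_numeric(df["TC"], errors="coerce")
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)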
Approach A — Sequential Single-Label#
Train a separate model for each screening criterion. Choose this approach when each label needs independent tuning or has very different characteristics.
A1. Single-Case Classification#
col_spec_sc = {
"text_cols": ["AF", "TI", "AB", "DE"],
"categorical_cols": ["SO", "DT"],
"numeric_cols": ["PY", "TC"],
"label_col": "single_case", # single string = single-label
}
# Fit on training data
train_dataset_sc, fitted_sc = preprocess_dataset(
train_df, tokenizer, device, col_spec_sc, numeric_transform
)
# Reuse the fitted transforms on val/test so no statistics leak from those splits
val_dataset_sc, _ = preprocess_dataset(
val_df, tokenizer, device, col_spec_sc, numeric_transform,
fitted_transforms=fitted_sc
)
test_dataset_sc, _ = preprocess_dataset(
test_df, tokenizer, device, col_spec_sc, numeric_transform,
fitted_transforms=fitted_sc
)
train_loader_sc = create_dataloader(train_dataset_sc, RandomSampler, config.batch_size)
val_loader_sc = create_dataloader(val_dataset_sc, SequentialSampler, config.eval_batch_size)
test_loader_sc = create_dataloader(test_dataset_sc, SequentialSampler, config.eval_batch_size)
model_sc = PubMLP(
categorical_vocab_sizes=fitted_sc.categorical_vocab_sizes,
numeric_cols_num=2,
mlp_hidden_size=config.mlp_hidden_size,
output_size=1,
dropout_rate=config.dropout_rate,
embedding_model=config.embedding_model,
model_name=config.model_name,
).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = AdamW(model_sc.parameters(), lr=config.learning_rate)
results_sc = train_evaluate_model(
model_sc, train_loader_sc, val_loader_sc, test_loader_sc,
optimizer, criterion, device, config.epochs,
early_stopping_patience=config.early_stopping_patience,
)
preds, probs, labels = get_predictions_and_labels(model_sc, test_loader_sc, device)
metrics_sc = calculate_evaluation_metrics(labels, preds, probs, label_name="single_case")
torch.save(results_sc[6], "model_single_case.pth")  # index 6 = best_model_state (see the unpacking in B2)
A2. Technology-Use Classification#
col_spec_tu = {
"text_cols": ["AF", "TI", "AB", "DE"],
"categorical_cols": ["SO", "DT"],
"numeric_cols": ["PY", "TC"],
"label_col": "technology_use", # different label, same features
}
train_dataset_tu, fitted_tu = preprocess_dataset(
train_df, tokenizer, device, col_spec_tu, numeric_transform
)
val_dataset_tu, _ = preprocess_dataset(
val_df, tokenizer, device, col_spec_tu, numeric_transform,
fitted_transforms=fitted_tu
)
test_dataset_tu, _ = preprocess_dataset(
test_df, tokenizer, device, col_spec_tu, numeric_transform,
fitted_transforms=fitted_tu
)
train_loader_tu = create_dataloader(train_dataset_tu, RandomSampler, config.batch_size)
val_loader_tu = create_dataloader(val_dataset_tu, SequentialSampler, config.eval_batch_size)
test_loader_tu = create_dataloader(test_dataset_tu, SequentialSampler, config.eval_batch_size)
model_tu = PubMLP(
categorical_vocab_sizes=fitted_tu.categorical_vocab_sizes,
numeric_cols_num=2,
mlp_hidden_size=config.mlp_hidden_size,
output_size=1,
dropout_rate=config.dropout_rate,
embedding_model=config.embedding_model,
model_name=config.model_name,
).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = AdamW(model_tu.parameters(), lr=config.learning_rate)
results_tu = train_evaluate_model(
model_tu, train_loader_tu, val_loader_tu, test_loader_tu,
optimizer, criterion, device, config.epochs,
early_stopping_patience=config.early_stopping_patience,
)
preds, probs, labels = get_predictions_and_labels(model_tu, test_loader_tu, device)
metrics_tu = calculate_evaluation_metrics(labels, preds, probs, label_name="technology_use")
torch.save(results_tu[6], "model_technology_use.pth")  # index 6 = best_model_state
A3. Predict Unlabeled (Sequential)#
unlabeled_df = pd.read_excel("unlabeled_data.xlsx")
unlabeled_df["PY"] = pd.to_numeric(unlabeled_df["PY"], errors="coerce")
unlabeled_df["TC"] = pd.to_numeric(unlabeled_df["TC"], errors="coerce")
# Predict single_case
unlabeled_ds_sc, _ = preprocess_dataset(
unlabeled_df, tokenizer, device, col_spec_sc, numeric_transform,
fitted_transforms=fitted_sc
)
loader_sc = create_dataloader(unlabeled_ds_sc, SequentialSampler, config.eval_batch_size)
model_sc.load_state_dict(torch.load("model_single_case.pth", map_location=device))
preds_sc, probs_sc = predict_model(model_sc, loader_sc, device)
# Predict technology_use
unlabeled_ds_tu, _ = preprocess_dataset(
unlabeled_df, tokenizer, device, col_spec_tu, numeric_transform,
fitted_transforms=fitted_tu
)
loader_tu = create_dataloader(unlabeled_ds_tu, SequentialSampler, config.eval_batch_size)
model_tu.load_state_dict(torch.load("model_technology_use.pth", map_location=device))
preds_tu, probs_tu = predict_model(model_tu, loader_tu, device)
unlabeled_df["single_case"] = ["Yes" if p == 1 else "No" for p in preds_sc]
unlabeled_df["single_case_prob"] = probs_sc
unlabeled_df["technology_use"] = ["Yes" if p == 1 else "No" for p in preds_tu]
unlabeled_df["technology_use_prob"] = probs_tu
unlabeled_df.to_excel("predicted_sequential.xlsx", index=False)
Approach B — Simultaneous Multi-Label#
Train one model that predicts all criteria at once. Shared encoder representations can improve performance when the labels are related.
B1. Preprocess with Multi-Label#
col_spec_multi = {
"text_cols": ["AF", "TI", "AB", "DE"],
"categorical_cols": ["SO", "DT"],
"numeric_cols": ["PY", "TC"],
"label_col": ["single_case", "technology_use"], # list = multi-label
}
train_dataset, fitted = preprocess_dataset(
train_df, tokenizer, device, col_spec_multi, numeric_transform
)
val_dataset, _ = preprocess_dataset(
val_df, tokenizer, device, col_spec_multi, numeric_transform,
fitted_transforms=fitted
)
test_dataset, _ = preprocess_dataset(
test_df, tokenizer, device, col_spec_multi, numeric_transform,
fitted_transforms=fitted
)
train_loader = create_dataloader(train_dataset, RandomSampler, config.batch_size)
val_loader = create_dataloader(val_dataset, SequentialSampler, config.eval_batch_size)
test_loader = create_dataloader(test_dataset, SequentialSampler, config.eval_batch_size)
B2. Train Multi-Label Model#
model = PubMLP(
categorical_vocab_sizes=fitted.categorical_vocab_sizes,
numeric_cols_num=2,
mlp_hidden_size=config.mlp_hidden_size,
output_size=2, # one output per label
dropout_rate=config.dropout_rate,
embedding_model=config.embedding_model,
model_name=config.model_name,
).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = AdamW(model.parameters(), lr=config.learning_rate)
(train_losses, val_losses,
train_accs, val_accs,
test_acc, best_val_loss,
best_model_state, best_epoch) = train_evaluate_model(
model, train_loader, val_loader, test_loader,
optimizer, criterion, device, config.epochs,
early_stopping_patience=config.early_stopping_patience,
)
torch.save(best_model_state, "model_multilabel.pth")
plot_results(train_losses, val_losses, train_accs, val_accs,
test_acc, best_val_loss, best_epoch)
B3. Evaluate Per-Label Metrics#
preds, probs, labels = get_predictions_and_labels(model, test_loader, device)
# Per-label P/R/F1/AUC + macro F1 + Hamming loss
metrics = calculate_evaluation_metrics(
labels, preds, probs,
label_names=["single_case", "technology_use"],
save_figures=True,
output_dir="figures/",
)
# Access per-label results
for lname, lmetrics in metrics["per_label"].items():
print(f'{lname}: F1={lmetrics["f1_score"]:.3f} AUC={lmetrics["roc_auc"]:.3f}')
print(f'Macro F1: {metrics["macro_f1"]:.3f}')
print(f'Hamming Loss: {metrics["hamming_loss"]:.3f}')
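As a sanity check, the aggregate numbers can be recomputed with scikit-learn's standard definitions (a sketch, assuming labels and preds come back as (n_records, n_labels) arrays):
import numpy as np
from sklearn.metrics import f1_score, hamming_loss
y_true = np.asarray(labels)  # shape: (n_records, n_labels)
y_pred = np.asarray(preds)
print("Macro F1:", f1_score(y_true, y_pred, average="macro"))
print("Hamming loss:", hamming_loss(y_true, y_pred))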
B4. Predict Unlabeled (Multi-Label)#
unlabeled_df = pd.read_excel("unlabeled_data.xlsx")
unlabeled_df["PY"] = pd.to_numeric(unlabeled_df["PY"], errors="coerce")
unlabeled_df["TC"] = pd.to_numeric(unlabeled_df["TC"], errors="coerce")
# Dummy label columns for preprocessing (values ignored during prediction)
for col in ["single_case", "technology_use"]:
unlabeled_df[col] = 0
unlabeled_dataset, _ = preprocess_dataset(
unlabeled_df, tokenizer, device, col_spec_multi, numeric_transform,
fitted_transforms=fitted
)
unlabeled_loader = create_dataloader(unlabeled_dataset, SequentialSampler, config.eval_batch_size)
model.load_state_dict(torch.load("model_multilabel.pth", map_location=device))
predictions, probabilities = predict_model(model, unlabeled_loader, device)
# Multi-label: predictions and probabilities are lists of lists
uncertain = flag_uncertain(probabilities)
print(f"Records with any uncertain label: {sum(any(u) for u in uncertain)} / {len(uncertain)}")
label_names = ["single_case", "technology_use"]
for i, col in enumerate(label_names):
unlabeled_df[col] = ["Yes" if p[i] == 1 else "No" for p in predictions]
unlabeled_df[f"{col}_prob"] = [p[i] for p in probabilities]
unlabeled_df[f"{col}_uncertain"] = [u[i] for u in uncertain]
unlabeled_df.to_excel("predicted_multilabel.xlsx", index=False)
Which Approach to Choose?#
| | Sequential (A) | Simultaneous (B) |
|---|---|---|
| When to use | Different class distributions per label; need separate hyperparameter tuning | Labels share text signal; want faster total training |
| Models trained | One per label | One for all labels |
| Metrics | Standard per-model metrics | Per-label + macro F1 + Hamming loss |
| Flexibility | Can use different configs per label | Shared architecture and training |
| Saving/loading | Save one model + fitted_transforms per label | Save one model + one fitted_transforms |
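Note that the examples above persist only model weights; as the last table row says, the fitted preprocessing state must be saved too, or later prediction runs cannot reproduce the vocabularies and scalings. A minimal sketch with joblib, assuming the fitted objects pickle cleanly:
import joblib
# Sequential: one fitted_transforms per label
joblib.dump(fitted_sc, "fitted_single_case.joblib")
joblib.dump(fitted_tu, "fitted_technology_use.joblib")
# Multi-label: a single fitted_transforms
joblib.dump(fitted, "fitted_multilabel.joblib")
# In a later session, reload before calling preprocess_dataset
fitted_sc = joblib.load("fitted_single_case.joblib")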
Active Learning#
Use active learning to iteratively label the most informative records, reducing total screening effort.
Import#
from pubmlp import (
ALState, select_query_batch, create_review_batch,
compare_reviewers, merge_human_labels, simulate_al,
rank_by_hybrid_max_uncertainty, rank_by_hybrid_max_random,
)
Select and Review a Batch#
# After running predict_model on unlabeled data
query_indices = select_query_batch(probs_sc, strategy='hybrid_max_uncertainty', batch_size=20)
review_batch = create_review_batch(unlabeled_df, query_indices, probs_sc)
review_batch.to_excel("review_batch.xlsx", index=False)
# After human review — load back and compare
reviewed = pd.read_excel("review_batch.xlsx")
agreement = compare_reviewers(reviewed["model_prediction"], reviewed["human_label"])
print(f'Agreement: {agreement["agreement_rate"]:.2%}, Kappa: {agreement["kappa"]:.3f}')
# Merge human labels back
unlabeled_df = merge_human_labels(unlabeled_df, reviewed, label_col="human_label")
Track State Across Iterations#
state = ALState(
labeled_indices=list(range(len(train_df))),
unlabeled_indices=list(range(len(unlabeled_df))),
)
# After each iteration, update state
state.labeled_indices.extend(query_indices.tolist())
state.unlabeled_indices = [i for i in state.unlabeled_indices if i not in set(query_indices.tolist())]
state.iteration += 1
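Putting the pieces together, one pass of the loop looks roughly like this. It is a sketch only: retrain_model is a hypothetical placeholder for whichever training recipe above you use, and the unlabeled loader would need rebuilding as the pool shrinks (omitted for brevity).
budget_iterations = 10  # arbitrary labeling budget
for _ in range(budget_iterations):
    # Score the remaining pool and pick the next batch to review
    preds_u, probs_u = predict_model(model_sc, loader_sc, device)
    query_indices = select_query_batch(probs_u, strategy="hybrid_max_uncertainty", batch_size=20)
    create_review_batch(unlabeled_df, query_indices, probs_u).to_excel("review_batch.xlsx", index=False)
    # ... pause here while a human labels review_batch.xlsx ...
    reviewed = pd.read_excel("review_batch.xlsx")
    unlabeled_df = merge_human_labels(unlabeled_df, reviewed, label_col="human_label")
    # Book-keeping, then retrain on the enlarged labeled pool
    state.labeled_indices.extend(query_indices.tolist())
    state.unlabeled_indices = [i for i in state.unlabeled_indices
                               if i not in set(query_indices.tolist())]
    state.iteration += 1
    model_sc = retrain_model(state)  # hypothetical helper, not part of pubmlp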
Offline Simulation#
def model_fn(train_df, unlabeled_df):
# Train model on train_df, predict on unlabeled_df
# Return probabilities as a list
...
history = simulate_al(
df, label_col="single_case", model_fn=model_fn,
strategy="hybrid_max_uncertainty", batch_size=20,
initial_pct=0.1, max_iterations=50,
)
for h in history[-3:]:
print(f'Iteration {h["iteration"]}: recall={h["recall"]:.2%}, screened={h["screened_pct"]:.1%}')
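For quick offline runs, the model_fn stub can be filled with something much cheaper than the full PubMLP pipeline. A stand-in sketch using TF-IDF plus logistic regression, assuming simulate_al hands model_fn frames that still carry the TI/AB text columns and a 0/1 single_case label:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
def model_fn(train_df, unlabeled_df):
    # Cheap stand-in for the PubMLP pipeline: TF-IDF over title + abstract
    text_train = train_df["TI"].fillna("") + " " + train_df["AB"].fillna("")
    text_new = unlabeled_df["TI"].fillna("") + " " + unlabeled_df["AB"].fillna("")
    vectorizer = TfidfVectorizer(max_features=20_000)
    X_train = vectorizer.fit_transform(text_train)
    X_new = vectorizer.transform(text_new)
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, train_df["single_case"])  # assumes 0/1 labels
    return clf.predict_proba(X_new)[:, 1].tolist()  # probability of the positive class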
Available Strategies#
| Strategy | Description |
|---|---|
| uncertainty | Most uncertain (closest to 0.5) first |
| random | Random order |
| max | Highest probability first |
| hybrid_max_uncertainty | 95% max-relevance + 5% uncertainty |
| hybrid_max_random | 95% max-relevance + 5% random |
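To make the hybrid descriptions concrete, here is an illustrative sketch of the 95%/5% split. This shows the ranking idea only and is not pubmlp's implementation:
def hybrid_max_uncertainty_sketch(probs, batch_size, max_frac=0.95):
    # Most of the batch goes to the highest-probability records (max-relevance),
    # the remainder to the records closest to the 0.5 decision boundary.
    n_max = round(batch_size * max_frac)
    by_max = sorted(range(len(probs)), key=lambda i: probs[i], reverse=True)
    by_uncertainty = sorted(range(len(probs)), key=lambda i: abs(probs[i] - 0.5))
    picked = by_max[:n_max]
    taken = set(picked)
    picked += [i for i in by_uncertainty if i not in taken][: batch_size - n_max]
    return picked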