Addressing the void of AI policies in education for students with specific learning disabilities

Published

January 3, 2026

This website contains supplemental materials and the code used to analyze text data in Shin et al. (2025).

Shin, M., Deniz, F., Watson, L., Dieterich, C., Ewoldt, K., Johnson, F., Kong, J., Lee, S. H., & Whitehurst, A. (in press). Addressing the void of AI policies in education for students with specific learning disabilities. Learning Disability Quarterly.

Flowchart of Data Collection and Analysis

%%{init: {
  "flowchart": {"rankSpacing": 24},
  "themeVariables": {"fontSize": "17px", "lineHeight": 1.1, "nodePadding": 6}
}}%%
flowchart TD
    n1["Policy Documents Analysis<br/>N = 12 Documents"] --> D["Document Categorization"]
    D --> E["Text Preprocessing"]
    E --> F["Text Embeddings<br/>Sentence-BERT"] & G["Topic Modeling<br/>Structural Topic Model"]
    F --> H["Similarity Analysis<br/>Cosine Similarity"]
    G --> I["Topic Label Generation<br/>OpenAI's API"]
    I --> V1["Research Team Validation<br/>Topic Labels"]
    F --> V1
    H --> P["Theme Comparison<br/>• SLD vs Other Disability<br/>• SLD vs General Policies"]
    P --> Q["Gap Identification"]
    n1 --> TP2["Text Preprocessing"]
    TP2 --> TM2["Topic Modeling<br/>Structural Topic Model"]
    TM2 --> J["Survey Item Generation<br/>OpenAI's API"]
    J --> V3["Research Team Validation<br/>Survey Items"]
    V3 --> B["Delphi Round 1<br/>N=17 Experts"]
    B --> C["Delphi Round 2<br/>N=14 Experts"] & K1["Expert Validation<br/>After Delphi Round 1"]
    C --> K2["Expert Validation<br/>After Delphi Round 2"]
    K1 --> L["Content Validity<br/>Lawshe's CVR Method"]
    K2 --> L
    L --> STM3["Topic Modeling<br/>Structural Topic Model"]
    STM3 --> M["Final Policy Items<br/>N=36 Items"]
    M --> T["Theme Generation<br/>OpenAI's ChatGPT"]
    T --> V2["Research Team Validation<br/>Theme Labels"]
    
    classDef dataPhase fill:#f8f9ff,stroke:#4a5568,stroke-width:2px,color:#2d3748
    classDef processPhase fill:#f7fdf7,stroke:#38a169,stroke-width:2px,color:#2d3748
    classDef analysisPhase fill:#fffaf0,stroke:#ed8936,stroke-width:2px,color:#2d3748
    classDef validationPhase fill:#fef5f7,stroke:#d53f8c,stroke-width:2px,color:#2d3748
    classDef researchValidation fill:#f0f4ff,stroke:#6366f1,stroke-width:2px,color:#2d3748
    classDef finalPhase fill:#e6fffa,stroke:#319795,stroke-width:2px,color:#2d3748
    
    n1:::processPhase
    D:::processPhase
    B:::dataPhase
    C:::dataPhase
    K1:::validationPhase
    E:::processPhase
    F:::analysisPhase
    G:::analysisPhase
    H:::analysisPhase
    I:::analysisPhase
    T:::analysisPhase
    TP2:::processPhase
    TM2:::analysisPhase
    V1:::researchValidation
    V2:::researchValidation
    V3:::researchValidation
    P:::analysisPhase
    Q:::analysisPhase
    J:::analysisPhase
    K2:::validationPhase
    L:::validationPhase
    STM3:::analysisPhase
    M:::finalPhase

Load R Packages

# Reset any time limits from previous session state
setTimeLimit(cpu = Inf, elapsed = Inf, transient = FALSE)

suppressPackageStartupMessages({
    library(tidyr)
    library(dplyr)
    library(readxl)
    library(gtsummary)
    library(officer)
    library(flextable)
    library(ggplot2)
    library(quanteda)
    library(reticulate)
    library(TextAnalysisR)
    library(htmlwidgets)
    library(stm)
    library(dotenv)
    library(ggdendro)
    library(plotly)
    library(httr)
    library(jsonlite)
    library(progress)
    library(openxlsx)
    library(stringr)
    library(purrr)
    library(text2vec)
    library(stringdist)
    library(DT)
})

Sys.setenv(HF_HUB_DISABLE_SYMLINKS_WARNING = "1")
Sys.setenv(TOKENIZERS_PARALLELISM = "false")
options(timeout = 600)
setTimeLimit(cpu = Inf, elapsed = Inf, transient = FALSE)
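
Topic labels and survey items below are generated through OpenAI's API, and the dotenv package is attached above; the following is a minimal sketch of the assumed key setup (the variable name OPENAI_API_KEY and the .env location are assumptions, not confirmed by the analysis code).

Code
# Minimal sketch (assumed setup): read the OpenAI key from a local .env file
dotenv::load_dot_env(".env")
nzchar(Sys.getenv("OPENAI_API_KEY"))  # should be TRUE before calling the API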

Load Dataset

load("data/ai_ld_data.rda")

Document-Level Analysis

Code
data <- read_excel("data/policyData.xlsx")
data_long <- data |>
  pivot_longer(
    cols           = matches("^coding_\\d+$"),
    names_to       = "coding_id",
    names_prefix   = "coding_",
    values_to      = "coding_text",
    values_drop_na = TRUE
  ) |>
  mutate(
    coding_id   = as.integer(coding_id),
    document_id = paste(ID, coding_id, sep = "_")
  )


united_tbl <- TextAnalysisR::unite_cols(data_long, listed_vars = "coding_text")

documents <- data_long %>%
    group_by(ID, doc_name, type, disability, SLD) %>%
    summarise(
        document_text = paste(coding_text, collapse = " "),
        n_chunks = n(),
        .groups = 'drop'
    ) %>%
    mutate(
        category = case_when(
            disability == "Yes" & SLD == "Yes" ~ "SLD",
            disability == "Yes" & SLD == "No" ~ "Other Disability", 
            disability == "No" & SLD == "No" ~ "General",
            TRUE ~ "Uncategorized"
        ),
        display_name = paste("Doc", ID),
        doc_numeric_id = as.numeric(str_extract(display_name, "\\d+"))
    ) %>%
    arrange(doc_numeric_id)

# No additional filtering is currently applied; documents_filtered mirrors `documents`
# (doc_numeric_id was already computed above)
documents_filtered <- documents %>%
    arrange(doc_numeric_id)

if (nrow(documents_filtered) == 0) {
    stop("No documents available! Check document lengths and categories.")
}

document_summary <- documents_filtered %>%
    select(
        Document = display_name,
        `Document Name` = doc_name,
        Type = type,
        Disability = disability,
        SLD = SLD,
        Category = category,
        `Coding Units` = n_chunks
    )
Code
# setTimeLimit(cpu = Inf, elapsed = Inf, transient = FALSE)
# `documents` was created in the previous chunk and is reused here.

# Generate embeddings for all documents using TextAnalysisR
all_embeddings <- TextAnalysisR::generate_embeddings(
  texts = documents$document_text,
  model = "all-MiniLM-L6-v2",
  verbose = TRUE
)

# Calculate full similarity matrix using TextAnalysisR
similarity_result <- TextAnalysisR::calculate_document_similarity(
  texts = documents$document_text,
  document_feature_type = "embeddings",
  similarity_method = "cosine",
  embedding_model = "all-MiniLM-L6-v2",
  verbose = TRUE
)

full_sim_matrix <- similarity_result$similarity_matrix

# Create document metadata for filtering
doc_metadata <- documents %>%
  mutate(row_idx = row_number()) %>%
  select(row_idx, display_name, ID, doc_name, category)

# Extract cross-category similarities (SLD vs Other categories)
similarity_threshold <- 0.6

sld_indices <- doc_metadata %>% filter(category == "SLD") %>% pull(row_idx)
other_disability_indices <- doc_metadata %>% filter(category == "Other Disability") %>% pull(row_idx)
general_indices <- doc_metadata %>% filter(category == "General") %>% pull(row_idx)

# Build ld_similarities by extracting from full matrix
ld_similarities <- bind_rows(
  # SLD vs Other Disability
  expand_grid(
    ld_idx = sld_indices,
    other_idx = other_disability_indices
  ) %>%
    mutate(
      cosine_similarity = map2_dbl(ld_idx, other_idx, ~full_sim_matrix[.x, .y]),
      other_category = "Other Disability"
    ),
  # SLD vs General
  expand_grid(
    ld_idx = sld_indices,
    other_idx = general_indices
  ) %>%
    mutate(
      cosine_similarity = map2_dbl(ld_idx, other_idx, ~full_sim_matrix[.x, .y]),
      other_category = "General"
    )
) %>%
  left_join(doc_metadata %>% select(row_idx, display_name, ID, doc_name) %>%
              rename(ld_doc_name = display_name, ld_doc_id = ID, ld_actual_name = doc_name),
            by = c("ld_idx" = "row_idx")) %>%
  left_join(doc_metadata %>% select(row_idx, display_name, ID, doc_name) %>%
              rename(other_doc_name = display_name, other_doc_id = ID, other_actual_name = doc_name),
            by = c("other_idx" = "row_idx")) %>%
  select(-ld_idx, -other_idx) %>%
  mutate(
    other_category = factor(other_category, levels = c("Other Disability", "General"))
  )

similarity_heatmap <- TextAnalysisR::plot_cross_category_heatmap(
  similarity_data = ld_similarities,
  row_var = "ld_doc_name",
  col_var = "other_doc_name",
  value_var = "cosine_similarity",
  category_var = "other_category",
  row_display_var = "ld_actual_name",
  col_display_var = "other_actual_name",
  row_label = "SLD Documents"
)

similarity_heatmap_plotly <- similarity_heatmap %>% 
  ggplotly(tooltip = "text", width = 800, height = 500) %>% 
  layout(hovermode = "closest") %>%
  style(hoverinfo = "text")
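
As a sanity check on the similarity values, any cell of the matrix can be recomputed directly from the embeddings. The helper below is a minimal sketch of the cosine formula itself, not TextAnalysisR's implementation, and assumes generate_embeddings() returns a matrix with one row per document.

Code
# Minimal sketch: cosine similarity, cos(a, b) = (a . b) / (||a|| ||b||)
cosine_sim <- function(a, b) sum(a * b) / (sqrt(sum(a * a)) * sqrt(sum(b * b)))

# Assuming one embedding row per document, this should approximately
# reproduce the corresponding entry of the full similarity matrix
cosine_sim(all_embeddings[1, ], all_embeddings[2, ])
full_sim_matrix[1, 2]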

Document-Level Similarity Heatmap

Document Embedding Similarity Statistics Table

Code
similarity_stats <- ld_similarities %>%
    group_by(other_category) %>%
    summarise(
        mean_similarity = round(mean(cosine_similarity), 3),
        median_similarity = round(median(cosine_similarity), 3),
        sd_similarity = round(sd(cosine_similarity), 3),
        min_similarity = round(min(cosine_similarity), 3),
        max_similarity = round(max(cosine_similarity), 3),
        .groups = 'drop'
    )

doc_gap_analysis <- TextAnalysisR::analyze_similarity_gaps(
    similarity_data = ld_similarities,
    ref_var = "ld_doc_name",
    other_var = "other_doc_name",
    similarity_var = "cosine_similarity",
    category_var = "other_category",
    unique_threshold = similarity_threshold,
    cross_policy_min = similarity_threshold,
    cross_policy_max = 0.8
)

LD_Unique <- doc_gap_analysis$unique_items %>%
    rename(match_doc = best_match, match_cat = best_match_category) %>%
    mutate(
        doc_numeric_id = as.numeric(str_extract(ld_doc_name, "\\d+")),
        gap_type = "LD_Unique"
    ) %>%
    arrange(doc_numeric_id)

missing_from_ld <- doc_gap_analysis$missing_items %>%
    rename(match_sld = best_match) %>%
    mutate(
        doc_numeric_id = as.numeric(str_extract(other_doc_name, "\\d+")),
        gap_type = "Missing_from_SLD"
    ) %>%
    arrange(doc_numeric_id)

cross_policy <- doc_gap_analysis$cross_policy %>%
    rename(match_doc = best_match, match_cat = best_match_category) %>%
    mutate(
        doc_numeric_id = as.numeric(str_extract(ld_doc_name, "\\d+")),
        gap_type = "Cross-Policy_Document"
    ) %>%
    arrange(doc_numeric_id)

Unique SLD Policy Documents

Lower similarity scores indicate greater uniqueness

Missing from SLD Policy Documents

Lower similarity scores indicate content missing from SLD policies

Cross-Policy Documents

Similarity scores between 0.6 and 0.8 indicate cross-policy learning potential
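
The three gap categories follow from these thresholds. The snippet below is an illustrative sketch of that classification logic only, assuming each SLD document is categorized by its best (maximum) cross-category similarity; analyze_similarity_gaps() may differ in its details.

Code
# Illustrative only: classify each SLD document by its best cross-category match
ld_similarities %>%
    group_by(ld_doc_name) %>%
    summarise(max_sim = max(cosine_similarity), .groups = "drop") %>%
    mutate(
        gap_type = case_when(
            max_sim < 0.6  ~ "LD_Unique",             # weakly related to all other policies
            max_sim <= 0.8 ~ "Cross-Policy_Document", # shared ground, learning potential
            TRUE           ~ "Strong_Overlap"         # already well covered elsewhere
        )
    )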

Code
cross_policy %>%
    select(ld_doc_name, min_similarity, max_similarity, match_doc, match_cat, gap_type) %>%
    DT::datatable(
        options = list(
            dom = 'Bfrtip',
            buttons = c('copy', 'excel', 'pdf'),
            pageLength = 10,
            scrollX = TRUE
        ),
        extensions = 'Buttons',
        class = "cell-border stripe",
        rownames = FALSE
    ) %>%
    DT::formatRound(columns = c("min_similarity", "max_similarity"), digits = 3)

Topic-Level Analysis

Code
# setTimeLimit(cpu = Inf, elapsed = Inf, transient = FALSE)
data <- data %>%
  mutate(
    category = case_when(
      disability == "Yes" & SLD == "Yes" ~ "SLD",
      disability == "Yes" & SLD == "No" ~ "Other Disability", 
      disability == "No" & SLD == "No" ~ "General",
      TRUE ~ "Uncategorized"
    )
  )

ld_docs <- data %>% 
  filter(category == "SLD") %>%
  pivot_longer(
    cols = matches("^coding_\\d+$"),
    names_to = "coding_id",
    names_prefix = "coding_",
    values_to = "coding_text",
    values_drop_na = TRUE
  ) %>%
  mutate(
    coding_id = as.integer(coding_id),
    document_id = paste(ID, coding_id, sep = "_")
  )

disability_docs <- data %>% 
  filter(category == "Other Disability") %>%
  pivot_longer(
    cols = matches("^coding_\\d+$"),
    names_to = "coding_id",
    names_prefix = "coding_",
    values_to = "coding_text",
    values_drop_na = TRUE
  ) %>%
  mutate(
    coding_id = as.integer(coding_id),
    document_id = paste(ID, coding_id, sep = "_")
  )

general_docs <- data %>% 
  filter(category == "General") %>%
  pivot_longer(
    cols = matches("^coding_\\d+$"),
    names_to = "coding_id",
    names_prefix = "coding_",
    values_to = "coding_text",
    values_drop_na = TRUE
  ) %>%
  mutate(
    coding_id = as.integer(coding_id),
    document_id = paste(ID, coding_id, sep = "_")
  )

common_words <- c("na", "may", "can", "[''']s")

united_tbl_ld <- TextAnalysisR::unite_cols(ld_docs, listed_vars = "coding_text")
tokens_ld <- TextAnalysisR::prep_texts(
  united_tbl_ld,
  text_field = "united_texts",
  remove_stopwords = TRUE,
  custom_stopwords = common_words,
  custom_valuetype = "regex",
  verbose = FALSE
)
# Note: tokens_wordstem() applies Porter stemming (not true lemmatization)
toks_lemmatized_ld <- quanteda::tokens_wordstem(tokens_ld)
dfm_ld <- dfm(toks_lemmatized_ld)

united_tbl_disability <- TextAnalysisR::unite_cols(disability_docs, listed_vars = "coding_text")
tokens_disability <- TextAnalysisR::prep_texts(
  united_tbl_disability,
  text_field = "united_texts",
  remove_stopwords = TRUE,
  custom_stopwords = common_words,
  custom_valuetype = "regex",
  verbose = FALSE
)
toks_lemmatized_disability <- quanteda::tokens_wordstem(tokens_disability)
dfm_disability <- dfm(toks_lemmatized_disability)

united_tbl_general <- TextAnalysisR::unite_cols(general_docs, listed_vars = "coding_text")
tokens_general <- TextAnalysisR::prep_texts(
  united_tbl_general,
  text_field = "united_texts",
  remove_stopwords = TRUE,
  custom_stopwords = common_words,
  custom_valuetype = "regex",
  verbose = FALSE
)
toks_lemmatized_general <- quanteda::tokens_wordstem(tokens_general)
dfm_general <- dfm(toks_lemmatized_general)

out_ld <- quanteda::convert(dfm_ld, to = "stm")
topic_model_ld <- stm(
  data = out_ld$meta,
  documents = out_ld$documents,
  vocab = out_ld$vocab,
  max.em.its = 75,
  init.type = "Spectral",
  K = 10, 
  prevalence = ~ 1,
  verbose = FALSE,
  seed = 2025
)

out_disability <- quanteda::convert(dfm_disability, to = "stm")
topic_model_disability <- stm(
  data = out_disability$meta,
  documents = out_disability$documents,
  vocab = out_disability$vocab,
  max.em.its = 75,
  init.type = "Spectral",
  K = 10,  
  prevalence = ~ 1,
  verbose = FALSE,
  seed = 2025
)

out_general <- quanteda::convert(dfm_general, to = "stm")
topic_model_general <- stm(
  data = out_general$meta,
  documents = out_general$documents,
  vocab = out_general$vocab,
  max.em.its = 75,
  init.type = "Spectral",
  K = 10, 
  prevalence = ~ 1,
  verbose = FALSE,
  seed = 2025
)

top_terms_ld <- TextAnalysisR::get_topic_terms(topic_model_ld, top_term_n = 10, verbose = FALSE)
top_terms_disability <- TextAnalysisR::get_topic_terms(topic_model_disability, top_term_n = 10, verbose = FALSE)
top_terms_general <- TextAnalysisR::get_topic_terms(topic_model_general, top_term_n = 10, verbose = FALSE)

top_labeled_ld <- TextAnalysisR::generate_topic_labels(top_terms_ld, model = "gpt-4o", temperature = 0.5, verbose = FALSE)
top_labeled_disability <- TextAnalysisR::generate_topic_labels(top_terms_disability, model = "gpt-4o", temperature = 0.5, verbose = FALSE)
top_labeled_general <- TextAnalysisR::generate_topic_labels(top_terms_general, model = "gpt-4o", temperature = 0.5, verbose = FALSE)

top_labeled_ld$category <- "SLD"
top_labeled_disability$category <- "Other Disability"
top_labeled_general$category <- "General"

top_labeled_ld <- top_labeled_ld %>% mutate(topic = as.numeric(topic)) %>% arrange(topic)
top_labeled_disability <- top_labeled_disability %>% mutate(topic = as.numeric(topic)) %>% arrange(topic)
top_labeled_general <- top_labeled_general %>% mutate(topic = as.numeric(topic)) %>% arrange(topic)

gamma_ld <- TextAnalysisR::get_topic_prevalence(topic_model_ld, category = "SLD")
gamma_disability <- TextAnalysisR::get_topic_prevalence(topic_model_disability, category = "Other Disability")
gamma_general <- TextAnalysisR::get_topic_prevalence(topic_model_general, category = "General")

all_gamma <- bind_rows(gamma_ld, gamma_disability, gamma_general)

all_labeled_topics <- bind_rows(top_labeled_ld, top_labeled_disability, top_labeled_general) %>%
  left_join(all_gamma, by = c("topic", "category")) %>%
  mutate(
    topic_gamma = paste0("Topic ", topic, " (", round(gamma, 3), ")")
  ) %>%
  arrange(category, topic)  

all_labeled_topics <- all_labeled_topics %>%
  mutate(
    topic = as.numeric(topic),  
    topic_label_with_gamma = paste0(
      topic, ". ", topic_label, " (", round(gamma, 3), ")"
    )
  )

ld_topic_texts <- TextAnalysisR::get_topic_texts(top_terms_ld)
disability_topic_texts <- TextAnalysisR::get_topic_texts(top_terms_disability)
general_topic_texts <- TextAnalysisR::get_topic_texts(top_terms_general)

# Generate topic embeddings using TextAnalysisR
ld_topic_emb <- TextAnalysisR::generate_embeddings(ld_topic_texts, model = "all-MiniLM-L6-v2", verbose = FALSE)
disability_topic_emb <- TextAnalysisR::generate_embeddings(disability_topic_texts, model = "all-MiniLM-L6-v2", verbose = FALSE)
general_topic_emb <- TextAnalysisR::generate_embeddings(general_topic_texts, model = "all-MiniLM-L6-v2", verbose = FALSE)

# Calculate cross-category topic similarity using TextAnalysisR
ld_labels <- top_labeled_ld %>% distinct(topic, .keep_all = TRUE) %>% arrange(topic) %>% pull(topic_label)
disability_labels <- top_labeled_disability %>% distinct(topic, .keep_all = TRUE) %>% arrange(topic) %>% pull(topic_label)
general_labels <- top_labeled_general %>% distinct(topic, .keep_all = TRUE) %>% arrange(topic) %>% pull(topic_label)

ld_vs_disability_result <- TextAnalysisR::calculate_cross_similarity(
  ld_topic_emb, disability_topic_emb,
  labels1 = ld_labels, labels2 = disability_labels
)
ld_vs_disability_cosine <- ld_vs_disability_result$similarity_df %>%
  rename(topic1 = row_idx, topic2 = col_idx, cosine_similarity = similarity) %>%
  mutate(cat1 = "SLD", cat2 = "Other Disability", comparison = "SLD vs Other Disability")

ld_vs_general_result <- TextAnalysisR::calculate_cross_similarity(
  ld_topic_emb, general_topic_emb,
  labels1 = ld_labels, labels2 = general_labels
)
ld_vs_general_cosine <- ld_vs_general_result$similarity_df %>%
  rename(topic1 = row_idx, topic2 = col_idx, cosine_similarity = similarity) %>%
  mutate(cat1 = "SLD", cat2 = "General", comparison = "SLD vs General")

ld_focused_similarities <- bind_rows(
  ld_vs_disability_cosine,
  ld_vs_general_cosine
)

topic_similarities <- ld_focused_similarities %>%
  left_join(
    top_labeled_ld %>% 
      arrange(topic) %>%
      select(topic, topic_label) %>%
      rename(ld_topic_label = topic_label),
    by = c("topic1" = "topic")
  ) %>%
  mutate(
    ld_topic_name = paste("Topic", topic1),
    other_topic_name = paste("Topic", topic2),
    ld_topic_id = label1,
    other_topic_id = label2,
    other_category = cat2
  ) %>%
  left_join(
    top_labeled_disability %>% 
      arrange(topic) %>%
      select(topic, topic_label) %>%
      rename(disability_topic_label = topic_label),
    by = c("topic2" = "topic")
  ) %>%
  left_join(
    top_labeled_general %>% 
      arrange(topic) %>%
      select(topic, topic_label) %>%
      rename(general_topic_label = topic_label),
    by = c("topic2" = "topic")
  ) %>%
  mutate(
    other_topic_label = case_when(
      cat2 == "Other Disability" ~ disability_topic_label,
      cat2 == "General" ~ general_topic_label,
      TRUE ~ as.character(topic2)
    )
  ) %>%
  select(
    ld_topic_name, other_topic_name, ld_topic_id, other_topic_id,
    ld_topic_label, other_topic_label, cosine_similarity, other_category
  ) %>%
  mutate(
    other_category = factor(other_category, levels = c("Other Disability", "General"))
  )

topic_similarities <- topic_similarities %>%
  distinct(ld_topic_name, other_topic_name, other_category, .keep_all = TRUE)

topic_similarity_heatmap <- TextAnalysisR::plot_cross_category_heatmap(
  similarity_data = topic_similarities,
  row_var = "ld_topic_name",
  col_var = "other_topic_name",
  value_var = "cosine_similarity",
  category_var = "other_category",
  row_display_var = "ld_topic_label",
  col_display_var = "other_topic_label",
  row_label = "SLD Topics"
) +
  ggplot2::facet_wrap(~ col_category, scales = "free_x", nrow = 2) +
  ggplot2::scale_fill_viridis_c(name = "Cosine\nSimilarity", option = "plasma")

topic_similarity_heatmap_plotly <- topic_similarity_heatmap %>% 
  ggplotly(tooltip = "text", width = 800, height = 1000) %>% 
  layout(
    hovermode = "closest",
    margin = list(b = 70)  
  ) %>%
  style(hoverinfo = "text")

Topic-Level Similarity Heatmap

Code
topic_similarity_stats <- topic_similarities %>%
  group_by(other_category) %>%
  summarise(
    mean_similarity = round(mean(cosine_similarity), 3),
    median_similarity = round(median(cosine_similarity), 3),
    sd_similarity = round(sd(cosine_similarity), 3),
    min_similarity = round(min(cosine_similarity), 3),
    max_similarity = round(max(cosine_similarity), 3),
    .groups = 'drop'
  ) %>%
  arrange(other_category)

topic_similarity_threshold <- 0.6

topic_gap_analysis <- TextAnalysisR::analyze_similarity_gaps(
    similarity_data = topic_similarities,
    ref_var = "ld_topic_name",
    other_var = "other_topic_name",
    similarity_var = "cosine_similarity",
    category_var = "other_category",
    ref_label_var = "ld_topic_label",
    other_label_var = "other_topic_label",
    unique_threshold = topic_similarity_threshold,
    cross_policy_min = topic_similarity_threshold,
    cross_policy_max = 0.8
)

LD_Unique_topics <- topic_gap_analysis$unique_items %>%
    rename(match_topic = best_match, match_cat = best_match_category) %>%
    mutate(
        topic_numeric_id = as.numeric(str_extract(ld_topic_name, "\\d+")),
        gap_type = "LD_Unique_Strength"
    ) %>%
    arrange(topic_numeric_id)

missing_from_ld_topics <- topic_gap_analysis$missing_items %>%
    rename(match_sld = best_match) %>%
    mutate(
        topic_numeric_id = as.numeric(str_extract(other_topic_name, "\\d+")),
        gap_type = "Missing_from_LD"
    ) %>%
    arrange(topic_numeric_id)

cross_policy_topics <- topic_gap_analysis$cross_policy %>%
    rename(match_topic = best_match, match_cat = best_match_category) %>%
    mutate(
        topic_numeric_id = as.numeric(str_extract(ld_topic_name, "\\d+")),
        gap_type = "Cross-Policy_Learning"
    ) %>%
    arrange(topic_numeric_id)

Topic-Level Similarity Statistics Table

Unique Topics from SLD Policies

Lower similarity scores indicate greater uniqueness

Code
LD_Unique_topics %>%
    select(-gap_type, -match_topic, -topic_numeric_id) %>%
    DT::datatable(
        options = list(
            dom = 'Bfrtip',
            buttons = c('copy', 'excel', 'pdf'),
            pageLength = 10,
            scrollX = TRUE
        ),
        extensions = 'Buttons',
        class = "cell-border stripe",
        rownames = FALSE
    ) %>% 
    DT::formatRound(columns = c("min_similarity", "max_similarity"), digits = 3)

Missing Topics from SLD Policies

Lower similarity scores indicate content missing from SLD policies

Code
missing_from_ld_topics %>%
    select(-gap_type, -topic_numeric_id) %>%
    DT::datatable(
        options = list(
            dom = 'Bfrtip',
            buttons = c('copy', 'excel', 'pdf'),
            pageLength = 10,
            scrollX = TRUE
        ),
        extensions = 'Buttons',
        class = "cell-border stripe",
        rownames = FALSE
    ) %>%
    DT::formatRound(columns = c("min_similarity", "max_similarity"), digits = 3)

Cross-Policy Learning Opportunities

Similarity scores between 0.6 and 0.8 indicate cross-policy learning potential

Code
cross_policy_topics %>%
    select(-gap_type, -match_topic, -topic_numeric_id) %>%
    DT::datatable(
        options = list(
            dom = 'Bfrtip',
            buttons = c('copy', 'excel', 'pdf'),
            pageLength = 10,
            scrollX = TRUE
        ),
        extensions = 'Buttons',
        class = "cell-border stripe",
        rownames = FALSE
    ) %>%
    DT::formatRound(columns = c("min_similarity", "max_similarity"), digits = 3)

Participants

Code
# Single-choice variables
single_tbl <- delphiFirst %>%
  select(spanish_hispanic_latino, sex) %>%
  pivot_longer(everything(), names_to = "Variable", values_to = "Choice") %>%
  count(Variable, Choice) %>%
  mutate(
    Percent = round(n / nrow(delphiFirst) * 100, 1),
    `N (%)` = paste0(n, " (", Percent, "%)"),
    Variable = case_when(
      Variable == "spanish_hispanic_latino" ~ "Spanish/Hispanic/Latino",
      Variable == "sex" ~ "Sex",
      TRUE ~ Variable
    )
  ) %>%
  select(Variable, Choice, `N (%)`)

openxlsx::write.xlsx(single_tbl, "data/single_tbl.xlsx")
Code
# Multiple-choice variables
total_n <- nrow(delphiFirst)

summarize_multichoice <- function(data, var, var_label) {
  data %>%
    pull({{ var }}) %>%
    str_split(",\\s*") %>%
    unlist() %>%
    str_trim() %>%
    table() %>%
    sort(decreasing = TRUE) %>%
    as.data.frame() %>%
    rename(Choice = ".", Count = "Freq") %>%
    mutate(
      Percent = round(Count / total_n * 100, 1),
      `N (%)` = paste0(Count, " (", Percent, "%)"),
      Variable = var_label
    ) %>%
    select(Variable, Choice, `N (%)`)
}

position_tbl <- summarize_multichoice(delphiFirst, position, "Position")
school_tbl   <- summarize_multichoice(delphiFirst, school_level, "School Level") 
race_tbl     <- summarize_multichoice(delphiFirst, race, "Race")

multi_tbl <- bind_rows(position_tbl, school_tbl, race_tbl)

openxlsx::write.xlsx(multi_tbl, "data/multi_tbl.xlsx")

Word Frequency

Code
setTimeLimit(cpu = Inf, elapsed = Inf, transient = FALSE)
tokens <- TextAnalysisR::prep_texts(united_tbl, text_field = "united_texts", verbose = FALSE)

# Detect candidate multi-word expressions (e.g., to inform the dictionary below)
tokens_collocations <- TextAnalysisR::detect_multi_words(tokens, size = 2:5, min_count = 2)
tokens_dict <- quanteda::dictionary(
  list(
    custom = c(
      "access to",
      "ai systems",
      "artificial intelligence",
      "assistive technology",
      "automated systems",
      "best practices",
      "civil rights",
      "decision making",
      "developers should",
      "educational settings",
      "federal government",
      "intelligence systems",
      "learning disability",
      "learning disabilities",
      "machine learning",
      "safety institute",
      "school district",
      "students with disabilities",
      "virtual environments"
    )
  )
)

compound_tokens <- function(tokens, dict) {
  quanteda::tokens_compound(
    tokens,
    pattern = dict,
    concatenator = "_",
    valuetype = "glob",
    window = 0,
    case_insensitive = TRUE,
    join = TRUE,
    keep_unigrams = FALSE,
    verbose = TRUE
  )
}

tokens_compound <- compound_tokens(tokens, tokens_dict)
dfm_object <- dfm(tokens_compound)

word_frequency_plot <- TextAnalysisR::plot_word_frequency(dfm_object, n = 20)

Remove Predefined Stopwords

Code
# setTimeLimit(cpu = Inf, elapsed = Inf, transient = FALSE)
toks_removed <- quanteda::tokens_remove(tokens_compound, pattern = stopwords::stopwords("en", source = "snowball"), verbose = FALSE)
dfm_init <- dfm(toks_removed)
word_frequency_remove_stopwords <- TextAnalysisR::plot_word_frequency(dfm_init, n = 20)

Remove Common Words in the Dataset

Code
# setTimeLimit(cpu = Inf, elapsed = Inf, transient = FALSE)
common_words <- c("na", "may", "can", "[''']s")
toks_removed_common <- quanteda::tokens_remove(toks_removed,
                                     pattern = common_words,
                                     valuetype = "regex",
                                     verbose = FALSE)
dfm_removed <- dfm(toks_removed_common)
word_frequency_remove_common <- TextAnalysisR::plot_word_frequency(dfm_removed, n = 20)

Lemmatize Tokens

Code
# setTimeLimit(cpu = Inf, elapsed = Inf, transient = FALSE)
toks_lemmatized <- TextAnalysisR::lemmatize_tokens(toks_removed_common, batch_size = 50)
dfm <- dfm(toks_lemmatized)  # note: this name shadows quanteda::dfm(); calls still resolve to the function
word_frequency_lemmatized <- TextAnalysisR::plot_word_frequency(dfm, n = 20)

Model Diagnostics (Optimal Topic Numbers)

Code
# setTimeLimit(cpu = Inf, elapsed = Inf, transient = FALSE)
# `dfm` was created in the "Lemmatize Tokens" chunk above and is reused here.

optimal_k_result <- TextAnalysisR::find_optimal_k(
    dfm_object = dfm,
    topic_range = 10:45,
    max.em.its = 50,
    categorical_var = NULL,
    continuous_var = NULL,
    height = 600,
    width = 800,
    verbose = FALSE)    

Structural Topic Modeling

Code
out <- quanteda::convert(dfm, to = "stm")

topic_model <- stm(
  data = out$meta,
  documents = out$documents,
  vocab = out$vocab,
  max.em.its = 75,
  init.type = "Spectral",
  K = 35,
  prevalence = ~ disability + SLD,
  verbose = FALSE,
  seed = 1234)

top_topic_terms <- TextAnalysisR::get_topic_terms(
  stm_model = topic_model,
  top_term_n = 10,
  verbose = FALSE
)

top_topic_terms %>%
    mutate_if(is.numeric, ~ round(., 3)) %>% 
    DT::datatable(
        options = list(
            dom = 'Bfrtip',
            buttons = c('copy', 'excel', 'pdf'),
            pageLength = 10,
            scrollX = TRUE
        ),
        extensions = 'Buttons',
        class = "cell-border stripe",
        rownames = FALSE
    )

Generate Policy Topic Labels Using OpenAI’s API

Code
top_labeled_topic_terms <- TextAnalysisR::generate_topic_labels(
  top_topic_terms,
  model = "gpt-3.5-turbo",
  temperature = 0.5,
  verbose = FALSE)

Probability of Words Observed in Each Topic (Beta)

Code
word_probability_ggplot <- TextAnalysisR::plot_word_probability(
  top_labeled_topic_terms,
  topic_label = "topic_label",
  ncol = 2
) +
  ggplot2::theme(
    axis.text.x = ggplot2::element_text(size = 12),
    axis.text.y = ggplot2::element_text(size = 12),
    axis.title.x = ggplot2::element_text(size = 12),
    axis.title.y = ggplot2::element_text(size = 12),
    legend.position = "right",
    legend.text = ggplot2::element_text(size = 12)
  )

word_probability_plot <- plotly::ggplotly(word_probability_ggplot, height = 5000, width = 1500)

word_probability_plot$x$layout$annotations <- lapply(
  word_probability_plot$x$layout$annotations,
  \(x) { x$font$size <- 16; x }
)
word_probability_plotly <- word_probability_plot |>
  plotly::layout(margin = list(t = 50, b = 50, l = 80, r = 20))
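
To inspect the beta values behind this plot directly, they can be extracted from the fitted STM object; assuming the tidytext package is installed, tidy() returns one row per topic-term pair.

Code
# Extract per-topic word probabilities (beta) from the fitted model
library(tidytext)

beta_tbl <- tidy(topic_model, matrix = "beta")

# Top five terms per topic, as a quick check against the plot above
beta_tbl %>%
    group_by(topic) %>%
    slice_max(beta, n = 5) %>%
    ungroup()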

Generate Policy Survey Items Using OpenAI’s API

Code
top_labeled_human_loop <- read_excel("data/top_labeled_topic_terms_human_loop.xlsx")

# Custom prompts for educational policy survey items
policy_system_prompt <- "
You are a highly skilled survey designer specializing in creating Likert-scale survey items for educational policy research. Your task is to generate clear, concise survey items that assess the importance of various AI policy aspects for students with learning disabilities.

Guidelines for generating survey items:

1. Format and Structure
   - Create statements that can be rated on a 5-point Likert scale (1=Strongly Disagree to 5=Strongly Agree)
   - Keep items clear, concise, and focused on a single concept
   - Use active voice and present tense
   - Avoid double-barreled questions or complex statements

2. Content Focus
   - Focus on the importance or necessity of the policy aspect
   - Use person-first language (e.g., 'students with learning disabilities' instead of 'disabled students')
   - Ensure items are specific and actionable
   - Avoid jargon unless necessary for technical accuracy

3. Clarity and Precision
   - Use simple, direct language
   - Avoid ambiguous terms or complex sentence structures
   - Make sure each item measures exactly one concept
   - Ensure items are relevant to educational settings

4. Response Scale Considerations
   - Frame items to work well with a 5-point Likert scale
   - Avoid extreme language that might force responses to one end of the scale
   - Ensure items can be meaningfully rated on the agree-disagree continuum

Example
----------
Top Terms (highest to lowest beta score):
virtual manipulatives (.035)
manipulatives (.022)
mathematical (.014)
app (.013)
solving (.013)
learning disability (.012)
algebra (.012)
area (.011)
tool (.010)
concrete manipulatives (.010)

Generated Survey Item:
AI-powered visual manipulatives should be made available to support mathematical problem-solving for students with learning disabilities.

Focus on creating clear, measurable items that capture the essence of each topic while following these guidelines.
"

policy_user_template <- "You have a topic with keywords listed from most to least significant: {terms}

Please create a single survey item that:
1. Captures the essence of this topic
2. Can be rated on a 5-point Likert scale (1=Strongly Disagree to 5=Strongly Agree)
3. Focuses on the importance of this policy aspect
4. Uses person-first language
5. Is clear, concise, and specific

Survey item:"
# Generate survey items using TextAnalysisR
policy_survey_items <- TextAnalysisR::generate_topic_content(
    topic_terms_df = top_labeled_human_loop,
    content_type = "survey_item",
    topic_var = "topic",
    term_var = "term",
    weight_var = "beta",
    provider = "openai",
    model = "gpt-3.5-turbo",
    temperature = 0,
    system_prompt = policy_system_prompt,
    user_prompt_template = policy_user_template,
    max_tokens = 50,
    verbose = TRUE
)

Round 1 Delphi Survey Results Table

Decision Rule Based on Experts’ Consensus and Content Validity Ratio (CVR):
- 75% or more of experts agree (Rating ≥ 4) & CVR ≥ 0.60 → Retain
- 70% or more of experts agree (Rating ≥ 4) & CVR ≥ 0.60 → Consider
- 75% or more of experts agree (Rating ≥ 4) & CVR < 0.60 → Consider
- Otherwise → Revise/Remove
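
As a worked example of the arithmetic (hypothetical counts): Lawshe's CVR is (n_e − N/2) / (N/2), where N is the panel size and n_e is the number of experts rating an item 4 or above.

Code
# Hypothetical worked example of Lawshe's CVR
N   <- 17                       # Round 1 panel size
n_e <- 14                       # experts rating the item >= 4 (hypothetical)
cvr <- (n_e - N / 2) / (N / 2)  # (14 - 8.5) / 8.5 = 0.647
pct <- round(n_e / N * 100, 1)  # 82.4
# 82.4% >= 75% agreement and CVR 0.647 >= 0.60 -> "Retain"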

Code
delphiFirst <- read_excel("data/delphiFirst.xlsx")

likert <- c(
  "Strongly Disagree" = 1,
  "Disagree" = 2,
  "Neutral" = 3,
  "Agree" = 4,
  "Strongly Agree" = 5
)

question_levels <- sort(as.numeric(gsub("Q", "", unique(delphiFirst %>% 
  select(matches("^Q[0-9]+$")) %>% 
  names()))))

question_levels <- paste0("Q", question_levels)

delphi_long <- delphiFirst %>%
  pivot_longer(
    cols = -c(
      ID,
      position:zip_code,
      starts_with("Q33_"),
      "Q34",
      "state",
      "census_region"
    ),
    names_to = "Question",
    values_to = "Rating"
  ) %>%
  mutate(
    Rating = likert[Rating],
    Question = factor(Question, levels = question_levels)
  )

round_1_results <- delphi_long %>%
  group_by(Question) %>%
  summarise(
    Total = n(),
    Strongly_Disagree = sum(Rating == 1),
    Disagree = sum(Rating == 2),
    Neutral = sum(Rating == 3),
    Agree = sum(Rating == 4),
    Strongly_Agree = sum(Rating == 5),
    Agreement_Count = sum(Rating >= 4),
    Percent_Agreement = round((Agreement_Count / Total) * 100, 1),
    Mean = round(mean(Rating), 2),
    SD = round(sd(Rating), 2),
    Essential = sum(Rating >= 4),
    CVR = round((Essential - (Total / 2)) / (Total / 2), 3),
    IQR = IQR(Rating),
    Consensus_Decision = case_when(
      Percent_Agreement >= 75 ~ "Retain",
      Percent_Agreement >= 70 ~ "Consider",
      TRUE ~ "Revise/Remove"
    ),
    CVR_Decision = ifelse(CVR >= 0.60, "Retain", "Revise/Remove"),
    Final_Decision = case_when(
      Percent_Agreement >= 75 & CVR >= 0.60 ~ "Retain",
      Percent_Agreement >= 70 & CVR >= 0.60 ~ "Consider",
      Percent_Agreement >= 75 & CVR < 0.60 ~ "Consider",
      TRUE ~ "Revise/Remove"
    ),
    .groups = 'drop'
  ) %>%
  ungroup() %>%
  mutate(
    Question = factor(
      Question,
      levels = Question[order(as.numeric(gsub("Q", "", Question)))]
    )
  )

# openxlsx::write.xlsx(round_1_results, "results/round_1_results.xlsx", rowNames = FALSE)

round_1_results_label <- read_excel("results/round_1_results_label.xlsx")

Round 1 Delphi Survey Results Plot

Code
round_1_results <- round_1_results %>%
  mutate(Final_Decision = factor(Final_Decision, levels = c("Retain", "Consider", "Revise/Remove")))

round_1_results <- round_1_results %>%
  mutate(
    tooltip_label = paste0(
      "Agreement: ", Percent_Agreement, "%\n",
      "CVR: ", round(CVR, 2), "\n",
      "Final Decision: ", Final_Decision
    )
  )

round_1_plot <- ggplot(round_1_results, aes(
  x = reorder(Question, Percent_Agreement),
  y = Percent_Agreement,
  fill = Final_Decision,
  text = tooltip_label
)) +
  geom_col() +
  geom_hline(yintercept = 75, linetype = "dashed", color = "blue") +
  coord_flip() +
  labs(
    title = "",
    subtitle = "",
    x = "Question",
    y = "Percent Agreement",
    fill = "Final Decision"
  ) +
  scale_fill_manual(values = c(
    "Retain" = "darkgreen",
    "Consider" = "#568203",
    "Revise/Remove" = "orange"
  )) +
  theme_minimal(base_size = 12) +
  theme(
    axis.text.x      = element_text(size = 12),
    axis.text.y      = element_text(size = 12),
    axis.title.x     = element_text(size = 12),
    axis.title.y     = element_text(size = 12),
    legend.position  = "right",
    legend.text      = element_text(size = 12)
  )

round_1_plotly <- plotly::ggplotly(round_1_plot, width = 800, height = 700, tooltip = "text") %>%
  plotly::layout(
    annotations = list(
      list(
        x = 0,
        y = -0.16,
        text = "Blue line = 75% agreement threshold (Delphi consensus)",
        showarrow = FALSE,
        xref = "paper",
        yref = "paper",
        font = list(size = 16, color = "gray40")
      )
    ),
    margin = list(b = 100, t = 40, l = 40)
  )

Round 1 Response Distribution by Question

Code
round_1_dist <- delphi_long %>%
  mutate(Response_Label = case_when(
    Rating == 1 ~ "Strongly Disagree",
    Rating == 2 ~ "Disagree", 
    Rating == 3 ~ "Neutral",
    Rating == 4 ~ "Agree",
    Rating == 5 ~ "Strongly Agree"
  )) %>%
  count(Question, Response_Label) %>%
  group_by(Question) %>%
  mutate(
    Percentage = round(n / sum(n) * 100, 1),
    Rating_Count = ifelse(Response_Label == "Strongly Agree", n, 0),
    Max_Rating = max(Rating_Count)
  )

round_1_dist <- round_1_dist %>%
  mutate(Response_Label = factor(Response_Label, levels = c("Strongly Disagree", "Disagree", "Neutral", "Agree", "Strongly Agree")))

round_1_dist <- round_1_dist %>%
  left_join(
    round_1_results %>% select(Question, Percent_Agreement),
    by = "Question"
  )

round_1_dist_plot <- ggplot(
    round_1_dist,
    aes(x = reorder(Question, Percent_Agreement), y = n, fill = Response_Label)
  ) +
  geom_col(position = "stack") +
  coord_flip() +
  labs(
    title = "Distribution of Responses by Question",
    x = "Question", 
    y = "Number of Responses",
    fill = "Rating"
  ) +
  scale_fill_brewer(type = "div", palette = "RdYlGn") +
  guides(fill = guide_legend(reverse = FALSE)) + 
  theme_minimal(base_size = 12) +
  theme(
    panel.grid.minor   = element_blank(),
    panel.grid.major.x = element_line(color = "gray90"),
    panel.grid.major.y = element_line(color = "gray90"),
    axis.text.x        = element_text(size = 12),
    axis.text.y        = element_text(size = 12),
    axis.title.x       = element_text(size = 12),
    axis.title.y       = element_text(size = 12),
    legend.position    = "right",
    legend.text        = element_text(size = 12)
  )

round_1_dist_plotly <- plotly::ggplotly(round_1_dist_plot, width = 800, height = 700)

Round 2 Delphi Survey Item Development

Code
# setTimeLimit(cpu = Inf, elapsed = Inf, transient = FALSE)
common_words_round2 <- c("also")

united_tbl_round2 <- TextAnalysisR::unite_cols(delphiFirst, listed_vars = "Q34")

tokens_round2 <- TextAnalysisR::prep_texts(
  united_tbl_round2,
  text_field = "united_texts",
  remove_stopwords = TRUE,
  custom_stopwords = common_words_round2,
  custom_valuetype = "regex",
  verbose = FALSE
)

tokens_collocations_round2 <- TextAnalysisR::detect_multi_words(tokens_round2, size = 2:5, min_count = 2)

dfm_object_round2 <- dfm(tokens_round2)

toks_lemmatized_round2 <- TextAnalysisR::lemmatize_tokens(tokens_round2, batch_size = 50)

dfm_round2 <- dfm(toks_lemmatized_round2)

word_frequency_lemmatized_round2 <- TextAnalysisR::plot_word_frequency(dfm_round2, n = 20)

optimal_k_result_round2 <- TextAnalysisR::find_optimal_k(
    dfm_object = dfm_round2,
    topic_range = 5:20,
    max.em.its = 75,
    categorical_var = NULL,
    continuous_var = "",
    height = 600,
    width = 800,
    verbose = FALSE)

Code
out_round2 <- quanteda::convert(dfm_round2, to = "stm")

topic_model_round_1 <- stm(
    data = out_round2$meta,
    documents = out_round2$documents,
    vocab = out_round2$vocab,
    max.em.its = 75,
    init.type = "Spectral",
    K = 11,
    prevalence = ~ 1, 
    verbose = FALSE,
    seed = 1234)

top_topic_terms_round_1 <- TextAnalysisR::get_topic_terms(
    stm_model = topic_model_round_1,
    top_term_n = 10,
    verbose = FALSE
)

top_labeled_topic_terms_round_1 <- TextAnalysisR::generate_topic_labels(
    top_topic_terms_round_1,
    model = "gpt-3.5-turbo",
    temperature = 0.5,
    verbose = FALSE)

openxlsx::write.xlsx(top_labeled_topic_terms_round_1, "data/top_labeled_topic_terms_round_1.xlsx")
Code
# Generate survey items for Round 1 using TextAnalysisR
policy_survey_items_round_1 <- TextAnalysisR::generate_topic_content(
    topic_terms_df = top_labeled_topic_terms_round_1,
    content_type = "survey_item",
    topic_var = "topic",
    term_var = "term",
    weight_var = "beta",
    provider = "openai",
    model = "gpt-3.5-turbo",
    temperature = 0,
    system_prompt = policy_system_prompt,
    user_prompt_template = policy_user_template,
    max_tokens = 50,
    verbose = TRUE
)

policy_survey_items_round_1_td <- 
    policy_survey_items_round_1 %>%
    group_by(topic) %>%
    summarise(
        survey_item = first(survey_item),
        term_beta = paste(term, "(", round(beta, 3), ")", collapse = ", ")
    ) %>%
    ungroup()

openxlsx::write.xlsx(policy_survey_items_round_1_td, "data/policy_survey_items_round_1_td.xlsx")

Round 2 Delphi Survey Results Table

Decision Rule Based on Experts’ Consensus and Content Validity Ratio (CVR):
- 75% or more of experts agree (Rating ≥ 4) & CVR ≥ 0.60 → Retain
- 70% or more of experts agree (Rating ≥ 4) & CVR ≥ 0.60 → Consider
- 75% or more of experts agree (Rating ≥ 4) & CVR < 0.60 → Consider
- Otherwise → Revise/Remove

Code
delphiSecond <- read_excel("data/delphiSecond.xlsx")

likert <- c(
  "Strongly Disagree" = 1,
  "Disagree" = 2,
  "Neutral" = 3,
  "Agree" = 4,
  "Strongly Agree" = 5
)

question_levels <- sort(as.numeric(gsub("Q", "", unique(delphiSecond %>% 
  select(matches("^Q[0-9]+$")) %>% 
  names()))))

question_levels <- paste0("Q", question_levels)

delphi_long_2 <- delphiSecond %>%
  pivot_longer(
    cols = -c(
      ID,
      starts_with("Q15_"),
      "Q16"
    ),
    names_to = "Question",
    values_to = "Rating"
  ) %>%
  mutate(
    Rating = likert[Rating],
    Question = factor(Question, levels = question_levels)
  )

round_2_results <- delphi_long_2 %>%
  group_by(Question) %>%
  summarise(
    Total = n(),
    Strongly_Disagree = sum(Rating == 1),
    Disagree = sum(Rating == 2),
    Neutral = sum(Rating == 3),
    Agree = sum(Rating == 4),
    Strongly_Agree = sum(Rating == 5),
    Agreement_Count = sum(Rating >= 4),
    Percent_Agreement = round((Agreement_Count / Total) * 100, 1),
    Mean = round(mean(Rating), 2),
    SD = round(sd(Rating), 2),
    Essential = sum(Rating >= 4),
    CVR = round((Essential - (Total / 2)) / (Total / 2), 3),
    IQR = IQR(Rating),
    Consensus_Decision = case_when(
      Percent_Agreement >= 75 ~ "Retain",
      Percent_Agreement >= 70 ~ "Consider",
      TRUE ~ "Revise/Remove"
    ),
    CVR_Decision = ifelse(CVR >= 0.60, "Retain", "Revise/Remove"),
    Final_Decision = case_when(
      Percent_Agreement >= 75 & CVR >= 0.60 ~ "Retain",
      Percent_Agreement >= 70 & CVR >= 0.60 ~ "Consider",
      Percent_Agreement >= 75 & CVR < 0.60 ~ "Consider",
      TRUE ~ "Revise/Remove"
    ),
    .groups = 'drop'
  ) %>%
  ungroup() %>%
  mutate(
    Question = factor(
      Question,
      levels = Question[order(as.numeric(gsub("Q", "", Question)))]
    )
  )

# openxlsx::write.xlsx(round_2_results, "results/round_2_results.xlsx", rowNames = FALSE)

round_2_results_label <- read_excel("results/round_2_results_label.xlsx")

Round 2 Delphi Survey Results Plot

Code
round_2_results <- round_2_results %>%
  mutate(Final_Decision = factor(Final_Decision, levels = c("Retain", "Consider", "Revise/Remove")))

round_2_results <- round_2_results %>%
  mutate(
    tooltip_label = paste0(
      "Agreement: ", Percent_Agreement, "%\n",
      "CVR: ", round(CVR, 2), "\n",
      "Final Decision: ", Final_Decision
    )
  )

round_2_plot <- ggplot(round_2_results, aes(
  x = reorder(Question, Percent_Agreement),
  y = Percent_Agreement,
  fill = Final_Decision,
  text = tooltip_label
)) +
  geom_col() +
  geom_hline(yintercept = 75, linetype = "dashed", color = "blue") +
  coord_flip() +
  labs(
    title = "",
    subtitle = "",
    x = "Question",
    y = "Percent Agreement",
    fill = "Final Decision"
  ) +
  scale_fill_manual(values = c(
    "Retain" = "darkgreen",
    "Consider" = "#568203",
    "Revise/Remove" = "orange"
  )) +
  theme_minimal(base_size = 12) +
  theme(
    axis.text.x      = element_text(size = 12),
    axis.text.y      = element_text(size = 12),
    axis.title.x     = element_text(size = 12),
    axis.title.y     = element_text(size = 12),
    legend.position  = "right",
    legend.text      = element_text(size = 12)
  )

round_2_plotly <- plotly::ggplotly(round_2_plot, width = 800, height = 600, tooltip = "text") %>%
  plotly::layout(
    annotations = list(
      list(
        x = 0,
        y = -0.22,
        text = "Blue line = 75% agreement threshold (Delphi consensus)",
        showarrow = FALSE,
        xref = "paper",
        yref = "paper",
        font = list(size = 16, color = "gray40")
      )
    ),
    margin = list(b = 120, t = 40, l = 40)
  )

Round 2 Response Distribution by Question

Code
round_2_dist <- delphi_long_2 %>%
  mutate(Response_Label = case_when(
    Rating == 1 ~ "Strongly Disagree",
    Rating == 2 ~ "Disagree", 
    Rating == 3 ~ "Neutral",
    Rating == 4 ~ "Agree",
    Rating == 5 ~ "Strongly Agree"
  )) %>%
  count(Question, Response_Label) %>%
  group_by(Question) %>%
  mutate(
    Percentage = round(n / sum(n) * 100, 1),
    Rating_Count = ifelse(Response_Label == "Strongly Agree", n, 0),
    Max_Rating = max(Rating_Count)
  )

round_2_dist <- round_2_dist %>%
  mutate(Response_Label = factor(Response_Label, levels = c("Strongly Disagree", "Disagree", "Neutral", "Agree", "Strongly Agree")))

round_2_dist <- round_2_dist %>%
  left_join(
    round_2_results %>% select(Question, Percent_Agreement),
    by = "Question"
  )

round_2_dist_plot <- ggplot(
    round_2_dist,
    aes(x = reorder(Question, Percent_Agreement), y = n, fill = Response_Label)
  ) +
  geom_col(position = "stack") +
  coord_flip() +
  labs(
    title = "Distribution of Responses by Question",
    x = "Question", 
    y = "Number of Responses",
    fill = "Rating"
  ) +
  scale_fill_brewer(type = "div", palette = "RdYlGn") +
  guides(fill = guide_legend(reverse = FALSE)) + 
  theme_minimal(base_size = 12) +
  theme(
    panel.grid.minor   = element_blank(),
    panel.grid.major.x = element_line(color = "gray90"),
    panel.grid.major.y = element_line(color = "gray90"),
    axis.text.x        = element_text(size = 12),
    axis.text.y        = element_text(size = 12),
    axis.title.x       = element_text(size = 12),
    axis.title.y       = element_text(size = 12),
    legend.position    = "right",
    legend.text        = element_text(size = 12)
  )

round_2_dist_plotly <- plotly::ggplotly(round_2_dist_plot, width = 800, height = 600)

Round 2 Delphi Survey Ranking of Policy Items

Code
ranking_long <- delphiSecond %>%
    pivot_longer(
        cols = starts_with("Q15_"),
        names_to = "rank_position", 
        values_to = "item_selected",
        names_pattern = "Q15_(\\d+)"
    ) %>%
    mutate(rank_position = as.numeric(rank_position)) %>%
    filter(!is.na(item_selected))

item_percentages <- ranking_long %>%
    group_by(item_selected) %>%
    summarise(
        Times_Selected = n(),
        Percentage = round((Times_Selected / 14) * 100, 1),  # 14 = Round 2 panel size
        .groups = 'drop'
    ) %>%
    arrange(desc(Percentage))