Exploring the Research Landscape on Single-Case Design Methodology Using Technology Through Text Mining and Large Language Models

Published

April 1, 2026

This website contains the data and the R and Python code used for the analyses in Shin and McKenna (in press). The scripts are also available through an online data repository at the Center for Open Science and on GitHub.

Shin, M., & McKenna, J. (in press). Exploring the research landscape on single-case design methodology using technology through text mining and large language models. Journal of Behavioral Education.

Pipeline

Figure 1. Semi-Automated Publication Screening

Semi-Automated Publication Screening

Search Strategy

View search terms

Web of Science (All Fields): Single-case design (18 terms) AND technology terms (49 terms)
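As a schematic sketch only, the query combined the two term groups with Boolean operators roughly as follows; the terms shown below are illustrative placeholders, not the actual lists (use the View search terms link above for the full 18 and 49 terms):

# Illustrative structure of the WoS All Fields query (placeholder terms only)
scd_terms  <- c('"single-case design"', '"single-subject design"')
tech_terms <- c('"mobile application"', '"virtual reality"')
query <- paste0("ALL=(", paste(scd_terms, collapse = " OR "), ") AND ALL=(",
                paste(tech_terms, collapse = " OR "), ")")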

Setup Environment

suppressPackageStartupMessages({
  library(tidyverse)
  library(knitr)
  library(readxl)
  library(data.table)
  library(broom.mixed)
  library(DT)
  library(bibliometrix)
  library(openxlsx)
  library(sf)
  library(rnaturalearth)
  library(ggrepel)
  library(ggiraph)
  library(stringi)
  library(brms)
  library(performance)
  library(visNetwork)
  library(htmltools)
  library(RColorBrewer)
  library(flextable)
})

load("evolution/bayes_model.RData")

Extract Web of Science (WoS) Data

# Read the twelve Web of Science export files and combine them into one data frame
wos_files <- c("1_500.ciw", "501_1000.ciw", "1001_1500.ciw", "1501_2000.ciw",
               "2001_2500.ciw", "2501_3000.ciw", "3001_3500.ciw", "3501_4000.ciw",
               "4001_4500.ciw", "4501_5000.ciw", "5001_5500.ciw", "5501_5935.ciw")

wos_5935 <- wos_files %>%
  purrr::map(~ convert2df(.x, dbsource = "wos", format = "endnote")) %>%
  bind_rows()

wos <- wos_5935 %>% 
  filter(PY >= 1970)

write.csv(wos, "init_all_data.csv", row.names = FALSE)

Model Training and Validation for Document Classification Tasks

A Transformer-MLP classifier (pubmlp) combined transformer embeddings with categorical and numeric features for binary classification. The model was trained on human-labeled records and used to predict labels for unlabeled records. Classification was performed sequentially: first for single-case design, then for technology use.
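In effect, the two predicted labels act as a sequential filter on the screened records. The sketch below is hypothetical: the column names pred_scd and pred_tech and the toy data are illustrative only, and the actual classification is performed in the Python scripts linked under each step.

# Hypothetical sketch of the sequential filtering implied by the two classifiers
records <- tibble(
  UT        = c("WOS:000001", "WOS:000002", "WOS:000003"),
  pred_scd  = c("Yes", "Yes", "No"),   # Step 1 label: single-case design
  pred_tech = c("Yes", "No",  "Yes")   # Step 2 label: technology use
)

records_included <- records %>%
  filter(pred_scd == "Yes") %>%   # keep records classified as single-case design
  filter(pred_tech == "Yes")      # then keep those classified as using technology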

Step 1. Text classification for single-case design

Code

Figure 2 (a). Training Loss and Accuracy

Step 2. Text classification for technology use

Code

Figure 2 (b). Training Loss and Accuracy

Step 3. Filtering studies that include methodology content

Code

A list of 77 wildcard expressions, implemented as regular expressions, was used to retain records containing methodology content. See Code for the full list.
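A minimal sketch of how such a wildcard filter can be applied is shown below. The patterns are illustrative only (the actual 77 expressions are in the linked Code), and matching titles and abstracts via the TI and AB columns of the WoS export is an assumption for this example.

# Illustrative wildcard expressions (not the actual list of 77)
method_wildcards <- c("multiple baseline*", "reversal design*", "alternating treatment*")

# Expand the trailing wildcards into regular expressions and combine them
method_regex <- paste(gsub("\\*", "\\\\w*", method_wildcards), collapse = "|")

# Keep records whose title or abstract matches any methodology pattern
data_methodology <- wos %>%
  filter(str_detect(str_to_lower(paste(TI, AB)), method_regex))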

Descriptive Bibliometric Analysis

Main Findings

Code
data <- read.xlsx("../data/df.xlsx")

data <- data %>% filter(filtered == "Yes")

data_descriptive <- data %>%
  dplyr::mutate(Study = paste("study", row_number(), sep = "")) %>%
  dplyr::mutate(document = paste(Study, PY, sep = "_")) %>%
  ungroup()

data_descriptive <- data_descriptive %>%
  dplyr::select(document, everything())

data_descriptive$DT <- case_when(
  data_descriptive$DT == "ARTICLE" ~ "Article",
  data_descriptive$DT == "BOOK CHAPTER" ~ "Book Chapter",
  data_descriptive$DT == "ARTICLE; BOOK CHAPTER" ~ "Book Chapter",
  data_descriptive$DT == "REVIEW; BOOK CHAPTER" ~ "Book Chapter",
  data_descriptive$DT == "ARTICLE; DATA PAPER" ~ "Article",
  data_descriptive$DT == "EARLY ACCESS" ~ "Early Access",
  data_descriptive$DT == "ARTICLE; EARLY ACCESS" ~ "Article",
  data_descriptive$DT == "REVIEW; EARLY ACCESS" ~ "Review",
  data_descriptive$DT == "PROCEEDINGS PAPER" ~ "Proceeding Paper",
  data_descriptive$DT == "ARTICLE; PROCEEDINGS PAPER" ~ "Article",
  data_descriptive$DT == "REVIEW" ~ "Review",
  data_descriptive$DT == "EDITORIAL MATERIAL" ~ "Editorial Material",
  data_descriptive$DT == "MEETING ABSTRACT" ~ "Meeting Abstract",
  TRUE ~ as.character(data_descriptive$DT)
)

source_hindex <- bibliometrix::Hindex(data_descriptive, years = Inf, field = "source")$H
source_hindex$SO <- source_hindex$Element
source_hindex$m_index <- round(source_hindex$m_index, 2)

source_TC <- data_descriptive %>%
  dplyr::group_by(SO) %>%
  dplyr::summarize(total_citation = sum(TC)) %>%
  dplyr::mutate(total_citation_percent = round((total_citation / sum(total_citation) * 100),2)) %>%
  ungroup()

source_TP <- data_descriptive %>%
  dplyr::group_by(SO) %>%
  dplyr::summarize(publication_number = n()) %>%
  dplyr::mutate(publication_number_percent = round((publication_number / sum(publication_number) * 100),2)) %>%
  ungroup()

source_detail <- left_join(source_TC, source_TP, by = "SO")

source_detail <- source_detail %>%
  group_by(SO) %>%
  dplyr::mutate(citation_per_publication = round((total_citation / publication_number),0))

source_detail <- source_detail %>% arrange(desc(total_citation), desc(publication_number))
source_combined <- left_join(source_detail, source_hindex, by = "SO")
source_combined <- source_combined %>%
  arrange(desc(total_citation), desc(publication_number)) %>%
  select(SO, PY_start, publication_number, publication_number_percent,
         total_citation, total_citation_percent,
         citation_per_publication,
         h_index, g_index, m_index)


write.xlsx(source_combined, file = "files/source_combined.xlsx", colNames = TRUE)

results <- biblioAnalysis(data_descriptive)
descriptive_summary <- summary(results, k=71, pause=F, width=130)
MainInformationDF <- descriptive_summary$MainInformationDF

write.xlsx(MainInformationDF, file = "files/MainInformationDF.xlsx", colNames = TRUE)

MostProdCountries <- descriptive_summary$MostProdCountries

write.xlsx(MostProdCountries, file = "files/MostProdCountries.xlsx", colNames = TRUE)

TCperCountries <- descriptive_summary$TCperCountries

write.xlsx(TCperCountries, file = "files/TCperCountries.xlsx", colNames = TRUE)

Map Publications and Citations Across Countries

Code
MostProdCountries <- MostProdCountries %>%
  mutate(
    Articles = as.numeric(Articles),
    Freq = as.numeric(Freq),
    SCP = as.numeric(SCP),
    MCP = as.numeric(MCP),
    MCP_Ratio = as.numeric(MCP_Ratio)
  )

TCperCountries <- TCperCountries %>%
  mutate(
    `Total Citations` = as.numeric(`Total Citations`),
    `Average Article Citations` = as.numeric(`Average Article Citations`)
  )

colnames(TCperCountries) <- stringr::str_trim(colnames(TCperCountries))

Countries_Prod_TC <- MostProdCountries %>%
  left_join(TCperCountries, by = "Country")

Countries_Prod_TC <- Countries_Prod_TC %>%
  mutate(Country = str_trim(Country, "both") %>% str_to_title())

Countries_Prod_TC <- Countries_Prod_TC %>%
  mutate(Country = case_when(
    Country == "Usa" ~ "USA",
    Country == "Korea" ~ "South Korea",
    Country == "U Arab Emirates" ~ "UAE",
    Country == "United Kingdom" ~ "UK",
    TRUE ~ as.character(Country)
  ))

write.xlsx(Countries_Prod_TC, file = "files/Countries_Prod_TC.xlsx", colNames = TRUE)

sf::sf_use_s2(TRUE)

world <- ne_countries(scale = "small", returnclass = "sf")

world <- world %>%
  mutate(admin = case_when(
    admin == "United States of America" ~ "USA",
    admin == "United Arab Emirates" ~ "UAE",
    admin == "United Kingdom" ~ "UK",
    admin == "United Republic of Tanzania" ~ "Tanzania",
    admin == "Republic of Serbia" ~ "Serbia",
    TRUE ~ as.character(admin)
  ))

wintr_proj <- "+proj=wintri +datum=WGS84 +no_defs +over"

if (is.na(st_crs(world))) {
  st_crs(world) <- 4326
}

world_projected <- lwgeom::st_transform_proj(world, crs = wintr_proj)
centroids_projected <- st_centroid(st_geometry(world_projected))
centroids_geo <- st_transform(centroids_projected, crs = st_crs(world))
world_centroids <- cbind(world, st_coordinates(centroids_geo))
world_joined <- left_join(world, Countries_Prod_TC, by = c("admin" = "Country"))
world_joined_filtered <- world_joined %>% filter(!is.na(Articles))
world_joined_filtered <- world_joined_filtered %>%
  mutate(country_article = paste(admin, Articles))

world_sf <- st_as_sf(world_joined_filtered, coords = c("longitude", "latitude"), crs = "+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0")

b <- c(200, 2000, 5000, 10000, 20000, 40000, 60000, 70000)

world_map_init <- world_sf %>%
  ggplot() +
  geom_sf_interactive(aes(fill = `Total Citations`,
                          tooltip = paste("TC: ", admin, `Total Citations`)),
                      size = 0.25) +
  geom_label_repel(
    size = 3.3,
    aes(geometry = geometry, label = admin),
    alpha = 0.7,
    max.overlaps = 13,
    stat = "sf_coordinates"
  ) +
  scale_fill_viridis_c(trans = "sqrt", na.value = "white", breaks = b, direction = -1, alpha = 0.9, begin = 0.6, end = 1) +
  geom_point_interactive(
    aes(size = Articles,
        geometry = geometry,
        tooltip = paste("TP: ", admin, Articles)),
    stat = "sf_coordinates",
    stroke = 1,
    shape = 21,
    color = "#A84268",
    alpha = 0.6
  ) +
  theme_void() +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.title = element_text(size = 11),
    legend.text = element_text(size = 10)
  ) +
  labs(fill = "Total citations (TC)", size = "Total publications (TP)") +
  guides(
    fill = guide_colourbar(title.position = "top", title.hjust = 0.5, barwidth = 18, barheight = 1),
    size = guide_legend(title.position = "top", title.hjust = 0.5)
  )

world_map <- girafe(ggobj = world_map_init, width_svg = 10, height_svg = 5) %>%
  girafe_options(
    opts_hover(css = "fill:cyan;"),
    opts_sizing(rescale = TRUE),
    css = "div.girafe-container { padding-top: 0px; margin-top: -20px; }
           .ggiraph-toolbar { position: absolute; bottom: 10px; right: 10px; }"
  )

htmlwidgets::saveWidget(
  widget = world_map,
  file = "world_map/index.html",
  selfcontained = TRUE)

ggsave('files/world_map.png',
       width = 7, height = 3.5, dpi = 300, plot = world_map_init)

save(world_map, file = "files/world_map.RData")

Figure 3. Map of Publications and Citations Across Countries

Interactive Map

Contextualized Topic Modeling and LLM-Assisted Topic Labeling

Code | Figure 4 | Figure 5

Word Co-Occurrence Network Analysis

Interactive Word Networks

Web App

Word Usage Summary

Code
message("\nGenerating word usage summary...")

df_summary <- read_excel("../data/df.xlsx", col_types = "text") %>%
  mutate(
    Year = as.numeric(Year),
    across(c(Topic, CustomLabel, unigrams, bigrams, trigrams), as.character)
  )

extract_ngrams <- function(data, type) {
  data %>%
    select(UT, WC, Year, Topic, tokens = all_of(type)) %>%
    filter(!is.na(tokens), !tokens %in% c("", "[]", "['']", '[""]')) %>%
    mutate(
      tokens = str_replace_all(tokens, "^\\[|\\]$", ""),  
      tokens = str_replace_all(tokens, "^'|'$", ""),      
      tokens = str_replace_all(tokens, '^"|"$', ""),      
      tokens = str_replace_all(tokens, "\\'", ""),       
      tokens = str_replace_all(tokens, '\\"', ""),         
      tokens = str_split(tokens, ",\\s*")
    ) %>%
    unnest(tokens) %>%
    filter(tokens != "", !is.na(tokens)) %>%
    mutate(
      tokens = str_trim(tokens),
      tokens = str_replace_all(tokens, "^'|'$", ""),
      tokens = str_replace_all(tokens, '^"|"$', ""),
      tokens = str_replace_all(tokens, "\\'", ""),
      tokens = str_replace_all(tokens, '\\"', "")
    ) %>%
    filter(tokens != "", !is.na(tokens)) %>%
    rename(ngram = tokens) %>%
    mutate(
      ngram_type = type,
      ngram = str_trim(ngram)
    )
}

ngram_types    <- c("unigrams", "bigrams", "trigrams")
available      <- intersect(ngram_types, names(df_summary))
ngrams_concat  <- map_dfr(available, ~ extract_ngrams(df_summary, .x))

unigrams_df <- ngrams_concat %>%
  filter(ngram_type == "unigrams") %>%
  mutate(
    Decade = paste0(floor(Year/10)*10, "s")
  )

first_year <- unigrams_df %>%
  group_by(ngram) %>%
  summarize(Year0 = min(Year, na.rm = TRUE), .groups = "drop") %>%
  mutate(FirstDecade = paste0(floor(Year0/10)*10, "s"))

unigrams_df <- unigrams_df %>%
  left_join(first_year %>% select(ngram, FirstDecade), by = "ngram")

write.xlsx(unigrams_df, "evolution/unigrams_df.xlsx", rowNames = FALSE)

process_tokens <- function(tokens) {
  tokens_clean <- tokens[!is.na(tokens)]
  tokens_clean <- str_trim(tokens_clean)
  tokens_clean <- tokens_clean[tokens_clean != ""]
  
  if (length(tokens_clean) == 0) {
    # Return an empty summary with the same columns as the non-empty case
    return(tibble::tibble(
      Total_Word_Count = 0L,
      Mean             = NA_real_,
      SD               = NA_real_,
      Min              = NA_integer_,
      `25th pct`       = NA_real_,
      `50th pct`       = NA_real_,
      `75th pct`       = NA_real_,
      Max              = NA_integer_,
      Skew             = NA_real_,
      Kurt             = NA_real_
    ))
  }
  
  freq <- as.integer(table(tokens_clean))
  
  skew_val <- if (length(freq) > 1 && var(freq) > 0) round(moments::skewness(freq), 2) else NA_real_
  kurt_val <- if (length(freq) > 1 && var(freq) > 0) round(moments::kurtosis(freq), 2) else NA_real_
  
  tibble::tibble(
    Total_Word_Count        = sum(freq),
    Mean                    = round(mean(freq), 2),
    SD                      = round(sd(freq), 2),
    Min                     = min(freq),
    `25th pct`  = as.numeric(quantile(freq, .25)),
    `50th pct`      = median(freq),
    `75th pct`  = as.numeric(quantile(freq, .75)),
    Max                     = max(freq),
    Skew                    = skew_val,
    Kurt                    = kurt_val
    # Distinct_Word_Count   = length(freq),
    # Count_Once            = sum(freq == 1L),
    # Count_More_Than_Once  = sum(freq > 1L)
  )
}

decades <- unigrams_df %>%
  pull(Decade) %>%
  unique() %>%
  na.omit() %>%
  sort()

decade_stats <- purrr::map_dfr(decades, function(d) {
  sub <- dplyr::filter(unigrams_df, Decade == d)
  stats <- process_tokens(sub$ngram)
  
  topic_stats <- sub %>%
    group_by(Topic) %>%
    summarise(
      topic_word_count = n(),
      topic_new_words = sum(FirstDecade == d),
      topic_reused_words = sum(FirstDecade < d),
      .groups = "drop"
    )
  
  reused <- sub %>%
    filter(!is.na(FirstDecade) & FirstDecade < d) %>%
    pull(ngram)
  reused_tab <- sort(table(reused), decreasing = TRUE)
  
  new <- sub %>%
    filter(FirstDecade == d) %>%
    pull(ngram)
  new_tab <- sort(table(new), decreasing = TRUE)
  
  stats %>%
    mutate(
      Decade = d,
      `All words` = Total_Word_Count,
      `Reused words` = sum(reused_tab),
      `New words` = sum(new_tab),
      Topic = nrow(topic_stats),
      `Top 10 reused words` = if (length(reused_tab)) 
        paste(names(head(reused_tab, 10)), collapse = ", ")
      else 
        NA_character_,
      `Top 10 new words` = if (length(new_tab)) 
        paste(names(head(new_tab, 10)), collapse = ", ")
      else 
        NA_character_
    ) %>%
    select(
      Decade,
      `All words`,
      `Reused words`,
      `New words`,
      Topic,
      `Top 10 reused words`,
      `Top 10 new words`,
      Mean,
      SD,
      Min,
      `25th pct`,
      `50th pct`,
      `75th pct`,
      Max,
      Skew,
      Kurt
      # Distinct_Word_Count,
      # Count_Once,
      # Count_More_Than_Once
    )
})

topic_decade_stats <- unigrams_df %>%
  group_by(Topic, Decade) %>%
  summarise(
    Word_Count = n(),
    New_Words = sum(FirstDecade == Decade),
    Reused_Words = sum(FirstDecade < Decade),
    .groups = "drop"
  ) %>%
  arrange(Topic, Decade)

word_usage_df <- decade_stats %>%
  mutate(across(-Decade, as.character)) %>%         
  tidyr::pivot_longer(
    cols = -Decade,
    names_to = "Metric",
    values_to = "Value"
  ) %>%
  tidyr::pivot_wider(
    names_from = Decade,
    values_from = Value
  )

openxlsx::write.xlsx(
  list(
    "Decade_Summary" = word_usage_df,
    "Topic_Decade_Summary" = topic_decade_stats
  ),
  "evolution/word_usage_summary.xlsx"
)

word_usage_table <- DT::datatable(
  word_usage_df,
  options = list(
    dom = 'Bfrtip',
    buttons = c('copy', 'csv', 'excel', 'pdf'),
    pageLength = 10,
    scrollX = TRUE
  ),
  extensions = 'Buttons',
  class = "cell-border stripe",
  rownames = FALSE
)

count_new_word_topic_year <- function(df) {
  df %>%
    group_by(Topic, Year) %>%
    summarise(ngram = list(unique(ngram)), .groups = "drop") %>%
    arrange(Topic, Year) %>%
    group_by(Topic) %>%
    mutate(
      cum_union       = purrr::accumulate(ngram, union),
      previous_words = dplyr::lag(cum_union, default = list(character(0))),
      new_words      = purrr::map2(ngram, previous_words, setdiff),
      Count           = purrr::map_int(new_words, length)
    ) %>%
    select(Topic, Year, Count)
}

new_word_count_df <- count_new_word_topic_year(unigrams_df)
topic_levels <- unique(new_word_count_df$Topic)
topic_levels <- topic_levels[order(as.numeric(topic_levels))]
new_word_count_df$Topic <- factor(new_word_count_df$Topic, levels = topic_levels)
new_word_count_df$Decade <- paste0(floor(new_word_count_df$Year/10)*10, "s")

write.xlsx(new_word_count_df, file = "evolution/new_word_count_df.xlsx", colNames = TRUE)

Bayesian Negative Binomial Piecewise Regression

Dispersion Ratio

Code
mean_val <- mean(new_word_count_df$Count)
var_val  <- var(new_word_count_df$Count)
overdispersion_ratio <- var_val / mean_val
overdispersion_ratio
[1] 87.501

The variance of the yearly new-word counts is roughly 88 times the mean, indicating substantial overdispersion and motivating a negative binomial rather than a Poisson model.

Preprocess

new_word_count_data <- new_word_count_df %>%
  group_by(Topic) %>%
  mutate(
    year_c = Year - min(Year, na.rm = TRUE),        # years since the topic first appeared
    post_2010 = as.integer(Year >= 2010),           # indicator for the 2010 breakpoint
    post_2010_years = (Year - 2010) * post_2010     # years elapsed since 2010 (0 before 2010)
  ) %>%
  ungroup()
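In equation form, these covariates imply a piecewise (segmented) mean structure; a sketch of the model fitted below, for topic $t$ in year $y$, is

$$
\text{Count}_{ty} \sim \text{NegBinomial}(\mu_{ty}, \phi), \qquad
\log(\mu_{ty}) = \beta_{0t} + \beta_{1t}\,\text{year\_c}_{ty} + \beta_{2t}\,\text{post\_2010}_{y} + \beta_{3t}\,\text{post\_2010\_years}_{ty}
$$

where the topic-specific coefficients arise from the Topic main effects and interactions in the formula, so that $\beta_{1t}$ is the pre-2010 slope, $\beta_{2t}$ the level shift at 2010, and $\beta_{3t}$ the change in slope after 2010 (all on the log scale).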

Estimate the Model

Code
# Use Topic 4 as the reference level for the topic contrasts
new_word_count_data$Topic <- as.factor(new_word_count_data$Topic)
new_word_count_data$Topic <- relevel(new_word_count_data$Topic, ref = "4")

formula_new_word <- bf(
  Count ~ 1 + year_c + post_2010 + post_2010_years +
    Topic + year_c:Topic + post_2010:Topic + post_2010_years:Topic
)

priors_new_word <- c(
  prior(normal(0, 5), class = "Intercept"),
  prior(normal(0, 2), class = "b")
)

model_new_word <- brm(
  formula = formula_new_word,
  data = new_word_count_data,
  family = negbinomial(link = "log"),
  prior = priors_new_word,
  control = list(adapt_delta = 0.95, max_treedepth = 12),
  iter = 2000,
  warmup = 1000,
  cores = 4,
  chains = 4,
  seed = 2025
)

coefs <- fixef(model_new_word)

topic_labels_df <- readxl::read_excel("../data/df.xlsx", col_types = "text") %>%
  select(Topic, CustomLabel) %>%
  distinct()

topic_starts <- new_word_count_df %>%
  group_by(Topic) %>%
  summarise(First_Year = as.character(min(Year, na.rm = TRUE)), .groups = "drop")

summary(model_new_word)
 Family: negbinomial 
  Links: mu = log 
Formula: Count ~ 1 + year_c + post_2010 + post_2010_years + Topic + year_c:Topic + post_2010:Topic + post_2010_years:Topic 
   Data: new_word_count_data (Number of observations: 526) 
  Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
         total post-warmup draws = 4000

Regression Coefficients:
                        Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
Intercept                   4.18      0.20     3.79     4.57 1.00     1901
year_c                      0.01      0.01    -0.01     0.03 1.00     1666
post_2010                   0.19      0.30    -0.39     0.78 1.00     1721
post_2010_years             0.00      0.04    -0.07     0.07 1.00     1477
Topic1                     -0.12      0.32    -0.72     0.53 1.00     2723
Topic2                     -0.56      0.36    -1.26     0.16 1.00     3009
Topic3                      0.06      0.44    -0.77     0.95 1.00     3317
Topic5                     -1.01      0.38    -1.74    -0.25 1.00     3294
Topic6                     -1.92      0.61    -3.06    -0.71 1.00     4500
Topic7                      0.46      0.36    -0.26     1.17 1.00     2885
Topic8                     -0.49      0.53    -1.46     0.57 1.00     3999
Topic9                     -2.19      0.40    -2.95    -1.37 1.00     3678
Topic10                     0.18      0.36    -0.51     0.88 1.00     3164
Topic11                    -0.15      0.48    -1.07     0.83 1.00     4067
Topic12                    -0.08      0.36    -0.76     0.65 1.00     3359
Topic13                    -1.24      0.66    -2.48     0.07 1.00     4246
Topic14                    -1.11      0.61    -2.29     0.13 1.00     3808
Topic15                     0.04      0.56    -1.00     1.18 1.00     3892
year_c:Topic1               0.02      0.01    -0.01     0.05 1.00     2615
year_c:Topic2               0.01      0.02    -0.02     0.05 1.00     2809
year_c:Topic3              -0.01      0.02    -0.05     0.03 1.00     3188
year_c:Topic5               0.04      0.02     0.01     0.07 1.00     2754
year_c:Topic6               0.07      0.02     0.02     0.11 1.00     3838
year_c:Topic7              -0.03      0.02    -0.07    -0.00 1.00     2732
year_c:Topic8               0.01      0.03    -0.04     0.06 1.00     4005
year_c:Topic9               0.06      0.02     0.02     0.09 1.00     3122
year_c:Topic10             -0.01      0.02    -0.04     0.03 1.00     3180
year_c:Topic11             -0.01      0.03    -0.07     0.04 1.00     4516
year_c:Topic12             -0.03      0.02    -0.07     0.00 1.00     3249
year_c:Topic13              0.02      0.03    -0.03     0.07 1.00     3679
year_c:Topic14              0.03      0.03    -0.02     0.08 1.00     3835
year_c:Topic15              0.02      0.05    -0.08     0.11 1.00     4136
post_2010:Topic1           -0.08      0.47    -1.00     0.85 1.00     3015
post_2010:Topic2            0.63      0.47    -0.29     1.54 1.00     2861
post_2010:Topic3            0.51      0.50    -0.46     1.47 1.00     3000
post_2010:Topic5            0.21      0.48    -0.74     1.14 1.00     2851
post_2010:Topic6           -0.91      0.50    -1.92     0.07 1.00     3014
post_2010:Topic7            0.42      0.48    -0.52     1.38 1.00     2820
post_2010:Topic8           -0.01      0.51    -1.01     0.98 1.00     2802
post_2010:Topic9           -0.49      0.49    -1.43     0.49 1.00     2852
post_2010:Topic10          -0.27      0.51    -1.26     0.72 1.00     3096
post_2010:Topic11          -0.17      0.56    -1.30     0.93 1.00     3706
post_2010:Topic12          -0.10      0.50    -1.04     0.90 1.00     3121
post_2010:Topic13           0.38      0.59    -0.78     1.51 1.00     3544
post_2010:Topic14          -0.47      0.54    -1.51     0.61 1.00     3183
post_2010:Topic15          -0.07      0.60    -1.27     1.08 1.00     3322
post_2010_years:Topic1      0.04      0.05    -0.07     0.13 1.00     2201
post_2010_years:Topic2     -0.01      0.05    -0.11     0.08 1.00     2294
post_2010_years:Topic3      0.11      0.05     0.00     0.21 1.00     2389
post_2010_years:Topic5     -0.01      0.05    -0.12     0.09 1.00     2326
post_2010_years:Topic6     -0.02      0.05    -0.12     0.09 1.00     2396
post_2010_years:Topic7      0.06      0.05    -0.05     0.16 1.00     2210
post_2010_years:Topic8      0.02      0.05    -0.09     0.13 1.00     2219
post_2010_years:Topic9     -0.02      0.05    -0.12     0.08 1.00     2331
post_2010_years:Topic10     0.04      0.06    -0.07     0.15 1.00     2447
post_2010_years:Topic11     0.10      0.06    -0.02     0.22 1.00     2840
post_2010_years:Topic12     0.15      0.05     0.04     0.25 1.00     2487
post_2010_years:Topic13    -0.06      0.06    -0.17     0.05 1.00     2360
post_2010_years:Topic14    -0.02      0.06    -0.13     0.09 1.00     2058
post_2010_years:Topic15    -0.05      0.07    -0.18     0.09 1.00     2773
                        Tail_ESS
Intercept                   2611
year_c                      2215
post_2010                   2289
post_2010_years             2106
Topic1                      2863
Topic2                      2799
Topic3                      2841
Topic5                      3106
Topic6                      3114
Topic7                      3006
Topic8                      3011
Topic9                      3227
Topic10                     2636
Topic11                     3144
Topic12                     2914
Topic13                     2937
Topic14                     3102
Topic15                     2876
year_c:Topic1               2707
year_c:Topic2               3081
year_c:Topic3               2419
year_c:Topic5               3107
year_c:Topic6               2815
year_c:Topic7               3017
year_c:Topic8               2926
year_c:Topic9               3100
year_c:Topic10              3084
year_c:Topic11              3087
year_c:Topic12              2788
year_c:Topic13              2866
year_c:Topic14              2769
year_c:Topic15              2983
post_2010:Topic1            2792
post_2010:Topic2            2833
post_2010:Topic3            2927
post_2010:Topic5            2992
post_2010:Topic6            3072
post_2010:Topic7            3174
post_2010:Topic8            2993
post_2010:Topic9            3057
post_2010:Topic10           2807
post_2010:Topic11           3562
post_2010:Topic12           3104
post_2010:Topic13           3551
post_2010:Topic14           3131
post_2010:Topic15           2886
post_2010_years:Topic1      3027
post_2010_years:Topic2      2742
post_2010_years:Topic3      3027
post_2010_years:Topic5      2614
post_2010_years:Topic6      2891
post_2010_years:Topic7      2883
post_2010_years:Topic8      2975
post_2010_years:Topic9      2846
post_2010_years:Topic10     3155
post_2010_years:Topic11     2910
post_2010_years:Topic12     2940
post_2010_years:Topic13     2476
post_2010_years:Topic14     2925
post_2010_years:Topic15     2675

Further Distributional Parameters:
      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
shape     2.59      0.17     2.26     2.94 1.00     5488     2770

Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).

Posterior Predictive Checks
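The check reported here can be reproduced with brms along these lines (a minimal sketch; the exact plot type shown may differ):

# Compare observed new-word counts with draws from the posterior predictive distribution
pp_check(model_new_word, type = "dens_overlay", ndraws = 100)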

Piecewise Contrasts Between Topics

Code
topics <- sort(as.integer(str_extract(
  grep("^Topic\\d+$", rownames(coefs), value = TRUE), "\\d+"
)))
var_order <- c("Intercept", "year_c", "post_2010", "post_2010_years",
  paste0(rep(c("", "year_c:", "post_2010:", "post_2010_years:"), times = length(topics)),
         "Topic", rep(topics, each = 4)))

model_new_word_results <- coefs %>%
  as.data.frame() %>%
  tibble::rownames_to_column("Variable") %>%
  mutate(
    IRR = exp(Estimate),        # incidence rate ratio: multiplicative change in the expected count
    IRR_Q2.5 = exp(Q2.5),
    IRR_Q97.5 = exp(Q97.5),
    across(where(is.numeric), ~round(., 2)),
    Variable = factor(Variable, levels = var_order)
  ) %>%
  arrange(Variable) %>%
  mutate(Variable = as.character(Variable))

model_new_word_results_apa <- model_new_word_results %>%
  mutate(
    `Est (SD)` = sprintf("%.2f (%.2f)", Estimate, Est.Error),
    `95% CI` = sprintf("[%.2f, %.2f]", Q2.5, Q97.5),
    `IRR` = sprintf("%.2f", IRR),
    `IRR 95% CI` = sprintf("[%.2f, %.2f]", IRR_Q2.5, IRR_Q97.5),
    Sig = ifelse(Q2.5 > 0 | Q97.5 < 0, "*", "")
  ) %>%
  select(Variable, `Est (SD)`, `95% CI`, IRR, `IRR 95% CI`, Sig)

save_as_docx(flextable(model_new_word_results_apa),
             path = "evolution/model_new_word_results_apa.docx")

Piecewise Regressions by Topic

Code
topics_other <- setdiff(levels(new_word_count_data$Topic), "4")
effects <- c("Intercept", "year_c", "post_2010", "post_2010_years")

hyp_strings <- c(
  # Topic 4 (reference) — main effects are the total effects
  "Intercept = 0",
  "year_c = 0",
  "post_2010 = 0",
  "post_2010_years = 0",
  # Other topics — main + interaction = total effect
  paste0("Intercept + Topic", topics_other, " = 0"),
  paste0("year_c + year_c:Topic", topics_other, " = 0"),
  paste0("post_2010 + post_2010:Topic", topics_other, " = 0"),
  paste0("post_2010_years + post_2010_years:Topic", topics_other, " = 0")
)

hyp <- hypothesis(model_new_word, hyp_strings)

n_other <- length(topics_other)
topic_vec <- c(rep("4", 4), rep(topics_other, 4))
effect_vec <- c(effects, rep(effects, each = n_other))

per_topic_effects <- hyp$hypothesis %>%
  mutate(
    Topic = topic_vec,
    Variable = effect_vec,
    IRR = exp(Estimate),
    IRR_lower = exp(CI.Lower),
    IRR_upper = exp(CI.Upper),
    Sig = ifelse(CI.Lower > 0 | CI.Upper < 0, "*", "")
  ) %>%
  left_join(topic_labels_df, by = "Topic") %>%
  left_join(topic_starts, by = "Topic") %>%
  mutate(
    `Topic` = paste0(Topic, ": ", CustomLabel),
    `Est (SD)` = sprintf("%.2f (%.2f)", Estimate, Est.Error),
    `95% CI` = sprintf("[%.2f, %.2f]", CI.Lower, CI.Upper),
    `IRR` = sprintf("%.2f", IRR),
    `IRR 95% CI` = sprintf("[%.2f, %.2f]", IRR_lower, IRR_upper)
  ) %>%
  select(Topic, First_Year, Variable, `Est (SD)`, `95% CI`, IRR, `IRR 95% CI`, Sig) %>%
  arrange(as.integer(str_extract(Topic, "^\\d+")), factor(Variable, levels = effects))

save_as_docx(flextable(per_topic_effects),
             path = "evolution/per_topic_effects.docx")

Predicted New Word Trajectories

Code
predData <- expand.grid(
  Year = seq(min(new_word_count_df$Year), max(new_word_count_df$Year), by = 1),
  Topic = unique(new_word_count_df$Topic)
)

predData <- predData %>%
  left_join(topic_starts %>% mutate(First_Year = as.integer(First_Year)), by = "Topic") %>%
  mutate(
    year_c = Year - First_Year,
    post_2010 = as.integer(Year >= 2010),
    post_2010_years = (Year - 2010) * post_2010
  ) %>%
  select(-First_Year)

pred <- predict(model_new_word, newdata = predData, probs = c(0.025, 0.975))
predData$pred <- pred[, "Estimate"]
predData$lower <- pred[, "Q2.5"]
predData$upper <- pred[, "Q97.5"]

new_word_gg <- ggplot() +
  geom_point(data = new_word_count_df, 
             aes(x = Year, y = Count, group = Topic),
             size = 1, alpha = 0.4) +
  geom_line(data = predData,
            aes(x = Year, y = pred, group = Topic),
            color = "blue", linewidth = 0.5, alpha = 0.7) +
  geom_ribbon(data = predData,
              aes(x = Year, ymin = lower, ymax = upper, group = Topic),
              fill = "blue", alpha = 0.1) +
  labs(
    x = "Year", 
    y = "New Word Count"
  ) +
  facet_wrap(~ Topic, ncol = 3, scales = "free_y",
             labeller = labeller(Topic = function(x) paste("Topic", x))) +
  scale_x_continuous(
    breaks = seq(1970, 2020, by = 20),
    limits = c(1970, 2025),
    expand = expansion(0, 0)
  ) +
  theme_minimal(base_size = 12) +
  theme(
    panel.grid.minor   = element_blank(),
    panel.grid.major.x = element_line(color = "gray90"),
    panel.grid.major.y = element_line(color = "gray90"),
    strip.text         = element_text(size = 12),
    axis.line.x        = element_line(color = "gray90"),
    axis.line.y        = element_line(color = "gray90"),
    axis.ticks.length  = unit(0.2, "cm"),
    axis.text.x        = element_text(size = 12),
    axis.text.y        = element_text(size = 12),
    axis.title.x       = element_text(
      size   = 12, 
      margin = margin(t = 5)   
    ),
    axis.title.y       = element_text(
      size   = 12, 
      margin = margin(r = 12),  
      angle  = 90, 
      vjust  = 0.5
    ),
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5, color = "gray40"),
    plot.caption = element_text(size = 10, color = "gray40"),
    legend.position = "bottom"
  )