TextAnalysisR provides a supporting workflow for text mining analysis. The web app incorporates quanteda (text preprocessing), stm (structural topic modeling), and ggraph and widyr (network analysis). tidytext is used to convert non-tidy objects into a tidy format. The R Shiny web app is available via TextAnalysisR::TextAnalysisR.app() or at https://textanalysisr.org. Functions are provided for computing word-topic probabilities, document-topic probabilities, estimated effects of covariates on topic prevalence, and network analyses, mirroring the tasks available in the web app.

The steps below mirror those demonstrated in the Shiny web app at TextAnalysisR::TextAnalysisR.app().

Install the TextAnalysisR Package

The development version can be installed from GitHub:

install.packages("devtools")
devtools::install_github("mshin77/TextAnalysisR")

Load the TextAnalysisR Library
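
Load the package to make its functions and the example dataset available:

library(TextAnalysisR)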

Alternatively, Launch and Browse the Shiny App
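
Instead of working from the console, you can launch the app locally or browse the hosted version at https://textanalysisr.org:

TextAnalysisR::TextAnalysisR.app()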

Process Files

Choose the Dataset

Upload an example dataset
mydata <- TextAnalysisR::process_files(dataset_choice = "Upload an Example Dataset")

head(mydata)
## # A tibble: 6 × 6
##   reference_type  author                             year title keyword abstract
##   <chr>           <chr>                             <dbl> <chr> <chr>   <chr>   
## 1 journal_article Block, G. H.                       1980 Dysc… Arithm… Notes t…
## 2 thesis          Bukatman, K. L.                    1981 The … locus … This st…
## 3 journal_article Watkins, M. W., & Webb, C.         1981 Comp… Comput… Results…
## 4 journal_article Chaffin, J. D.                     1982 Arc-… Comput… The Arc…
## 5 journal_article Chaffin, J. D., Maxwell, B., & T…  1982 ARC-… Electr… This ar…
## 6 thesis          Golden, C. K.                      1982 The … NA      The pur…
Upload your file
file_info <- data.frame(filepath = "inst/extdata/SpecialEduTech.xlsx")  # path to a local file (here, the example file shipped with the package)
mydata <- TextAnalysisR::process_files(dataset_choice = "Upload Your File",
                                       file_info = file_info)
head(mydata)
## # A tibble: 6 × 6
##   reference_type  author                             year title keyword abstract
##   <chr>           <chr>                             <dbl> <chr> <chr>   <chr>   
## 1 journal_article Block, G. H.                       1980 Dysc… Arithm… Notes t…
## 2 thesis          Bukatman, K. L.                    1981 The … locus … This st…
## 3 journal_article Watkins, M. W., & Webb, C.         1981 Comp… Comput… Results…
## 4 journal_article Chaffin, J. D.                     1982 Arc-… Comput… The Arc…
## 5 journal_article Chaffin, J. D., Maxwell, B., & T…  1982 ARC-… Electr… This ar…
## 6 thesis          Golden, C. K.                      1982 The … NA      The pur…
Copy and paste text
text_input <- paste0("The purpose of this study was to conduct a content analysis of ",
                     "research on technology use.")
mydata <- TextAnalysisR::process_files(dataset_choice = "Copy and Paste Text",
                                       text_input = text_input)
mydata
## # A tibble: 1 × 1
##   text                                                                          
##   <chr>                                                                         
## 1 The purpose of this study was to conduct a content analysis of research on te…

Preprocess Text Data

Unite Text Columns

df <- TextAnalysisR::SpecialEduTech

united_tbl <- TextAnalysisR::unite_text_cols(df, listed_vars = c("title", "keyword", "abstract"))

united_tbl
## # A tibble: 490 × 7
##    united_texts               reference_type author  year title keyword abstract
##    <chr>                      <chr>          <chr>  <dbl> <chr> <chr>   <chr>   
##  1 Dyscalculia and the minic… journal_artic… Block…  1980 Dysc… Arithm… Notes t…
##  2 The effects of computer-a… thesis         Bukat…  1981 The … locus … This st…
##  3 Computer Assisted Instruc… journal_artic… Watki…  1981 Comp… Comput… Results…
##  4 Arc-Ed Curriculum: Applic… journal_artic… Chaff…  1982 Arc-… Comput… The Arc…
##  5 ARC-ED curriculum: the ap… journal_artic… Chaff…  1982 ARC-… Electr… This ar…
##  6 The Effect of the Hand-he… thesis         Golde…  1982 The … NA      The pur…
##  7 A review of some traditio… journal_artic… Neal,…  1982 A re… tradit… Discuss…
##  8 A study of the effectiven… thesis         Engle…  1983 A st… microc… The pur…
##  9 The influence of computer… thesis         Foste…  1983 The … comput… The eff…
## 10 Using Computer Software t… journal_artic… Pomme…  1983 Usin… Comput… The art…
## # ℹ 480 more rows

Segment a Corpus Into Tokens

tokens <- TextAnalysisR::preprocess_texts(united_tbl,
                                          text_field = "united_texts",
                                          min_char = 2,
                                          remove_punct = TRUE,
                                          remove_symbols = TRUE,
                                          remove_numbers = TRUE,
                                          remove_url = TRUE,
                                          remove_separators = TRUE,
                                          split_hyphens = TRUE,
                                          split_tags = TRUE,
                                          include_docvars = TRUE,
                                          keep_acronyms = FALSE,
                                          padding = FALSE,
                                          verbose = FALSE)
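
To verify the segmentation, the first documents can be inspected with quanteda's print options (max_ndoc and max_ntoken limit how much is shown):

print(tokens, max_ndoc = 2, max_ntoken = 10)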

Detect Multi-Word Expressions

collocations <- TextAnalysisR::detect_multi_word_expressions(tokens, size = 2:5, 
                                                             min_count = 2)

head(collocations)
## [1] "students with"         "learning disabilities" "this study"           
## [4] "problem solving"       "with disabilities"     "assisted instruction"
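
Alternatively, every detected expression can be compounded at once by passing the collocations directly to quanteda::tokens_compound(); a minimal sketch (toks_all_compound is an illustrative name), as opposed to the curated dictionary used in the next section:

toks_all_compound <- quanteda::tokens_compound(tokens, pattern = quanteda::phrase(collocations))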

Process Tokens With Compound Words

library(quanteda)
custom_dict <- quanteda::dictionary(
  list(custom = c("learning disabilities", "problem solving", "assisted instruction"))
)

toks_compound <- quanteda::tokens_compound(
  tokens,
  pattern = custom_dict,
  concatenator = "_",
  valuetype = "glob",
  window = 0,
  case_insensitive = TRUE,
  join = TRUE,
  keep_unigrams = FALSE,
  verbose = TRUE
)

Plot Word Frequency

dfm_object_init <- quanteda::dfm(toks_compound)

TextAnalysisR::plot_word_frequency(dfm_object_init, n = 20)

Word Frequency Plot

Remove Predefined Stopwords

stopwords <- stopwords::stopwords("en", source = "snowball")

toks_removed <- quanteda::tokens_remove(toks_compound, pattern = stopwords, verbose = FALSE)

dfm_init <- quanteda::dfm(toks_removed)

TextAnalysisR::plot_word_frequency(dfm_init, n = 20)

Word Frequency Plot After Removing Stopwords

Remove Common Words in the Dataset

common_words <- c("study", "based", "learning", "students", "research", "results")

toks_removed_common <- quanteda::tokens_remove(toks_removed, pattern = common_words, verbose = FALSE)

dfm_init_updated <- quanteda::dfm(toks_removed_common)

TextAnalysisR::plot_word_frequency(dfm_init_updated, n = 20)

Word Frequency Plot After Removing Common Words

Lemmatize Tokens

library(spacyr)
# spacyr requires a Python spaCy installation; run spacyr::spacy_install() once if needed.

# Collapse the preprocessed tokens back into one string per document for spaCy.
texts <- sapply(toks_removed_common, paste, collapse = " ")

parsed <- spacyr::spacy_parse(x = texts, lemma = TRUE, entity = FALSE, pos = FALSE)

# Rebuild a quanteda tokens object from the lemmas.
toks_lemmatized <- quanteda::as.tokens(parsed, use_lemma = TRUE)

dfm_object <- quanteda::dfm(toks_lemmatized)

# spacy_parse() does not carry over document variables, so restore them from the earlier dfm.
quanteda::docvars(dfm_object) <- quanteda::docvars(dfm_init_updated)

TextAnalysisR::plot_word_frequency(dfm_object, n = 20)

Word Frequency Plot After Lemmatizing Tokens

Word Analysis

Analyze and Visualize Word Co-occurrence Networks

Word co-occurrence network plot
word_co_occurrence_network_results <- 
    TextAnalysisR::word_co_occurrence_network(
        dfm_object,
        doc_var = "reference_type",
        co_occur_n = 100,
        top_node_n = 30,
        nrows = 1,
        height = 1200,
        width = 800)
## [1] "doc_var has 2 levels: journal_article, thesis"
## [1] "Processing group level: journal_article"
## [1] "Processing group level: thesis"
## [1] "Generating table for level: journal_article"
## [1] "Generating table for level: thesis"
## [1] "Generating summary for level: journal_article"
## [1] "Generating summary for level: thesis"
word_co_occurrence_network_results$plot
Word co-occurrence network centrality table
word_co_occurrence_network_results$table

(an interactive centrality table is rendered for each level: journal_article and thesis)

Word co-occurrence network summary
word_co_occurrence_network_results$summary

(a summary is rendered for each level: journal_article and thesis)

Analyze and Visualize Word Correlation Networks

Word correlation network plot
word_correlation_network_results <- 
  TextAnalysisR::word_correlation_network(
    dfm_object,
    doc_var = "reference_type",
    common_term_n = 30,
    corr_n = 0.42,
    top_node_n = 40,
    nrows = 2,
    height = 1400,
    width = 800)
## [1] "doc_var has 2 levels: journal_article, thesis"
## [1] "Processing group level: journal_article"
## [1] "Processing group level: thesis"
## [1] "Generating table for level: journal_article"
## [1] "Generating table for level: thesis"
## [1] "Generating summary for level: journal_article"
## [1] "Generating summary for level: thesis"
word_correlation_network_results$plot
Word correlation network centrality table
word_correlation_network_results$table

(an interactive centrality table is rendered for each level: journal_article and thesis)

Word correlation network summary
word_correlation_network_results$summary

(a summary is rendered for each level: journal_article and thesis)

Analyze and Visualize Word Frequency Distributions Across a Continuous Variable

Word frequency distribution plot
word_frequency_distribution_results <- TextAnalysisR::word_frequency_distribution(
  dfm_object,
  continuous_variable = "year",
  selected_terms = c("calculator", "computer"),
  height = 500,
  width = 800)

word_frequency_distribution_results$plot
Word frequency distribution table
word_frequency_distribution_results$table

(a frequency table is rendered for each selected term: calculator and computer)

Structural Topic Modeling

Evaluate Optimal Number of Topics

TextAnalysisR::evaluate_optimal_topic_number(
  dfm_object = dfm_object,
  topic_range = 5:30,
  max.em.its = 75,
  categorical_var = "reference_type",
  continuous_var = "year",
  height = 600,
  width = 800,
  verbose = FALSE)

Select Top Terms for Each Topic

out <- quanteda::convert(dfm_object, to = "stm")

library(stm)
stm_15 <- stm(
  data = out$meta,
  documents = out$documents,
  vocab = out$vocab,
  max.em.its = 75,
  init.type = "Spectral",
  K = 15,
  prevalence = ~ reference_type + s(year),
  verbose = FALSE)

top_topic_terms <- TextAnalysisR::select_top_topic_terms(
  stm_model = stm_15,
  top_term_n = 10,
  verbose = FALSE
)

library(dplyr)
top_topic_terms %>%
  mutate_if(is.numeric, ~ round(., 3)) %>%
  DT::datatable(
    rownames = FALSE,
    extensions = 'Buttons',
    options = list(
      scrollX = TRUE,
      scrollY = "400px",
      width = "80%",
      dom = 'Bfrtip',
      buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
    )
  ) %>%
  DT::formatStyle(
    columns = c("topic", "term", "beta"),
    fontSize = '16px'
  )

Generate Topic Labels Using OpenAI’s API

Directly input the OpenAI API key
top_labeled_topic_terms <- TextAnalysisR::generate_topic_labels(
  top_topic_terms,
  model = "gpt-3.5-turbo",
  temperature = 0.5,
  openai_api_key = "your_openai_api_key",
  verbose = FALSE)

top_labeled_topic_terms 
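The key can also be set as an environment variable for the current session before calling the function (this assumes the package falls back to the conventional OPENAI_API_KEY variable; check the function documentation):

Sys.setenv(OPENAI_API_KEY = "your_openai_api_key")  # assumed variable name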
Load the OpenAI API key from a .env file in the working directory
top_labeled_topic_terms <- TextAnalysisR::generate_topic_labels(
  top_topic_terms,
  model = "gpt-3.5-turbo",
  temperature = 0.5,
  verbose = FALSE)

top_labeled_topic_terms %>%
  mutate_if(is.numeric, ~ round(., 3)) %>%
  DT::datatable(
    rownames = FALSE,
    extensions = 'Buttons',
    options = list(
      scrollX = TRUE,
      scrollY = "400px",
      width = "80%",
      dom = 'Bfrtip',
      buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
    )
  ) %>%
  DT::formatStyle(
    columns = c("topic", "topic_label", "term", "beta"),
    fontSize = '16px'
  )

Plot Highest Word Probabilities for Each Topic

Add generated topic labels to the top topic terms
word_probability_plot <- TextAnalysisR::word_probability_plot(
  top_labeled_topic_terms,
  topic_label = "topic_label",
  ncol = 1,
  height = 3500,
  width = 800)

word_probability_plot
Use non-labeled topics
word_probability_plot <- TextAnalysisR::word_probability_plot(
  top_topic_terms,
  ncol = 2,
  height = 2000,
  width = 800)

word_probability_plot
Manually label topics
manual_labels <- c("1" = "CAI for math problem solving",
                   "2" = "STEM technology",
                   "3" = "Use of manipulatives")

word_probability_plot <- TextAnalysisR::word_probability_plot(
  top_topic_terms,
  topic_label = manual_labels,
  ncol = 2,
  height = 2000,
  width = 850)

word_probability_plot

Per-Document Per-Topic Probabilities

Plot mean topic prevalence across documents
topic_probability_plot <- TextAnalysisR::topic_probability_plot(
  stm_model = stm_15,
  top_n = 10,
  height = 500,
  width = 800,
  verbose = TRUE)

topic_probability_plot
Create a table of mean topic prevalence across documents
topic_probability_table <- TextAnalysisR::topic_probability_table(
  stm_model = stm_15,
  top_n = 10,
  verbose = TRUE)

topic_probability_table
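
Estimate Effects of Covariates on Topic Prevalence

The estimated effects of covariates on topic prevalence mentioned in the introduction can be obtained from the fitted model; a minimal sketch using stm::estimateEffect() directly (TextAnalysisR may provide its own wrapper for this step):

prep <- stm::estimateEffect(1:15 ~ reference_type + s(year),
                            stmobj = stm_15,
                            metadata = out$meta)

summary(prep, topics = 1)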