Skip to contents

The TextAnalysisR package provides a supporting workflow for text mining analysis. The TextAnalysisR.app() function allows users to launch and browse a Shiny app. This web app incorporates ‘quanteda’ (text preprocessing), ‘stm’ (structural topic modeling), and ‘ggraph’ and ‘widyr’ (network analysis). ‘tidytext’ is used to convert non-tidy objects into a tidy format.

Installation

Install the development version from GitHub with:

# Install devtools only when it is not already available, then install the
# development version of TextAnalysisR from GitHub.
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
devtools::install_github("mshin77/TextAnalysisR")

Launch and Browse the Shiny app

Preprocess Text Data

# Load the SpecialEduTech example object exported by TextAnalysisR.
data <- TextAnalysisR::SpecialEduTech

# Preprocess the text held in the "abstract" field. The call is
# namespace-qualified, like the dataset above, so this snippet also runs
# when the package has not been attached with library().
preprocessed_data <- TextAnalysisR::preprocess_texts(
  data,
  text_field = "abstract"
)

Plot Word Frequency

# Build a document-feature matrix (dfm) object with the quanteda package
# from the preprocessed "abstract" text and store it in `dfm`.
# Plot word frequency for the top 20 terms.

dfm <- SpecialEduTech %>%
  preprocess_texts(text_field = "abstract") %>%
  quanteda::dfm()

# The pipe passes `dfm` as the first argument of plot_word_frequency().
dfm %>% plot_word_frequency(n = 20)

Examine Highest Per-Term Per-Topic Probabilities

# data is a tidy data frame that includes per-term per-topic probabilities
# (beta), produced by tidying the stm_15 object with tidytext.
# Examine the top 5 terms with the highest per-term per-topic probabilities.
# The value of top_n can be changed.

dfm <- SpecialEduTech %>%
  preprocess_texts(text_field = "abstract") %>%
  quanteda::dfm()

data <- tidytext::tidy(
  stm_15,
  document_names = rownames(dfm),
  log = FALSE
)

# Round every numeric column to 3 decimal places before rendering the
# table. across() replaces the superseded mutate_if().
data %>%
  examine_top_terms(top_n = 5) %>%
  dplyr::mutate(dplyr::across(where(is.numeric), ~ round(.x, 3))) %>%
  DT::datatable(rownames = FALSE)

Plot Per-Term Per-Topic Probabilities

# data is a tidy data frame that includes per-term per-topic probabilities
# (beta).
# Plot per-term per-topic probabilities for the top 2 terms (top_n = 2),
# passing ncol = 3 to plot_topic_term().
# The value of top_n can be changed.

dfm <- SpecialEduTech %>%
  preprocess_texts(text_field = "abstract") %>%
  quanteda::dfm()

data <- tidytext::tidy(
  stm_15,
  document_names = rownames(dfm),
  log = FALSE
)

data %>%
  examine_top_terms(top_n = 2) %>%
  plot_topic_term(ncol = 3)

Plot Per-Document Per-Topic Probabilities

# data is a tidy data frame that includes per-document per-topic
# probabilities (gamma); it is requested with matrix = "gamma".
# Plot per-document per-topic probabilities for the top 15 topics.
# The value of top_n can be changed.

dfm <- SpecialEduTech %>%
  preprocess_texts(text_field = "abstract") %>%
  quanteda::dfm()

data <- tidytext::tidy(
  stm_15,
  matrix = "gamma",
  document_names = rownames(dfm),
  log = FALSE
)

# Pass the plot returned by topic_probability_plot() on to plotly::ggplotly().
data %>%
  topic_probability_plot(top_n = 15) %>%
  plotly::ggplotly()

Visualize a Table for Per-Document Per-Topic Probabilities

# data is a tidy data frame that includes per-document per-topic
# probabilities (gamma); it is requested with matrix = "gamma".
# Create a table of per-document per-topic probabilities for the top 15
# topics.
# The value of top_n can be changed.

dfm <- SpecialEduTech %>%
  preprocess_texts(text_field = "abstract") %>%
  quanteda::dfm()

data <- tidytext::tidy(
  stm_15,
  matrix = "gamma",
  document_names = rownames(dfm),
  log = FALSE
)

# Render the result with DT::datatable(), without row names.
data %>%
  topic_probability_table(top_n = 15) %>%
  DT::datatable(rownames = FALSE)