The TextAnalysisR package provides a supporting workflow for text mining analysis.
The TextAnalysisR.app() function allows users to launch and browse a Shiny app.
This web app incorporates ‘quanteda’ (text preprocessing), ‘stm’ (structural topic modeling),
and ‘ggraph’ as well as ‘widyr’ (network analysis). ‘tidytext’ is used to
tidy non-tidy format objects.
The steps below are similar to those demonstrated in the Shiny
web app launched with TextAnalysisR::TextAnalysisR.app().
Installation
The development version can be installed from GitHub:
install.packages("devtools")
devtools::install_github("mshin77/TextAnalysisR")
Launch and Browse the Shiny App
library(TextAnalysisR)
if (interactive()) {
TextAnalysisR.app()
}
Unite Text Columns
df <- TextAnalysisR::SpecialEduTech
united_tbl <- TextAnalysisR::unite_text_cols(df, listed_vars = c("title", "keyword", "abstract"))
united_tbl
## # A tibble: 490 × 7
## united_texts reference_type author year title keyword abstract
## <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 Dyscalculia and the minic… journal_artic… Block… 1980 Dysc… Arithm… Notes t…
## 2 The effects of computer-a… thesis Bukat… 1981 The … locus … This st…
## 3 Computer Assisted Instruc… journal_artic… Watki… 1981 Comp… Comput… Results…
## 4 Arc-Ed Curriculum: Applic… journal_artic… Chaff… 1982 Arc-… Comput… The Arc…
## 5 ARC-ED curriculum: the ap… journal_artic… Chaff… 1982 ARC-… Electr… This ar…
## 6 The Effect of the Hand-he… thesis Golde… 1982 The … NA The pur…
## 7 A review of some traditio… journal_artic… Neal,… 1982 A re… tradit… Discuss…
## 8 A study of the effectiven… thesis Engle… 1983 A st… microc… The pur…
## 9 The influence of computer… thesis Foste… 1983 The … comput… The eff…
## 10 Using Computer Software t… journal_artic… Pomme… 1983 Usin… Comput… The art…
## # ℹ 480 more rows
Preprocess Text Data
df <- TextAnalysisR::SpecialEduTech
united_tbl <- TextAnalysisR::unite_text_cols(df, listed_vars = c("title", "keyword", "abstract"))
tokens <- TextAnalysisR::preprocess_texts(united_tbl, text_field = "united_texts", verbose = FALSE)
tokens
## Tokens consisting of 490 documents and 6 docvars.
## text1 :
## [1] "dyscalculia" "minicalculator" "alp" "program"
## [5] "arithmetic" "arithmetic" "remedial" "teaching"
## [9] "education" "learning" "disabled" "children"
## [ ... and 71 more ]
##
## text2 :
## [1] "effects" "computer" "assisted" "instruction"
## [5] "mastery" "multiplication" "facts" "learning"
## [9] "disabled" "elementary" "school" "aged"
## [ ... and 54 more ]
##
## text3 :
## [1] "computer" "assisted" "instruction" "learning" "disabled"
## [6] "students" "computer" "assisted" "instruction" "computer"
## [11] "programs" "drills"
## [ ... and 42 more ]
##
## text4 :
## [1] "arc" "ed" "curriculum" "applicability"
## [5] "severely" "handicapped" "pupils" "computer"
## [9] "assisted" "instruction" "games" "online"
## [ ... and 37 more ]
##
## text5 :
## [1] "arc" "ed" "curriculum" "application" "video"
## [6] "game" "formats" "educational" "software" "electronic"
## [11] "games" "curriculum"
## [ ... and 89 more ]
##
## text6 :
## [1] "effect" "hand" "held" "calculator" "mathematics"
## [6] "speed" "accuracy" "motivation" "secondary" "educable"
## [11] "mentally" "retarded"
## [ ... and 223 more ]
##
## [ reached max_ndoc ... 484 more documents ]
Plot Word Frequency
df <- TextAnalysisR::SpecialEduTech
united_tbl <- TextAnalysisR::unite_text_cols(df, listed_vars = c("title", "keyword", "abstract"))
tokens <- TextAnalysisR::preprocess_texts(united_tbl, text_field = "united_texts")
dfm_object <- quanteda::dfm(tokens)
word_frequency_plot <- TextAnalysisR::plot_word_frequency(dfm_object, n = 20)
word_frequency_plot
Evaluate Optimal Number of Topics
df <- TextAnalysisR::SpecialEduTech
united_tbl <- TextAnalysisR::unite_text_cols(df, listed_vars = c("title", "keyword", "abstract"))
tokens <- TextAnalysisR::preprocess_texts(united_tbl, text_field = "united_texts")
dfm_object <- quanteda::dfm(tokens)
TextAnalysisR::evaluate_optimal_topic_number(
dfm_object = dfm_object,
topic_range = 5:30,
max.em.its = 75,
categorical_var = "reference_type",
continuous_var = "year",
height = 600,
width = 800,
verbose = FALSE)
Plot Highest Word Probabilities for Each Topic
df <- TextAnalysisR::SpecialEduTech
united_tbl <- TextAnalysisR::unite_text_cols(df, listed_vars = c("title", "keyword", "abstract"))
tokens <- TextAnalysisR::preprocess_texts(united_tbl, text_field = "united_texts")
dfm_object <- quanteda::dfm(tokens)
TextAnalysisR::plot_word_probabilities(
dfm_object = dfm_object,
topic_n = 15,
max.em.its = 75,
categorical_var = "reference_type",
continuous_var = "year",
top_term_n = 10,
ncol = 3,
height = 1200,
width = 800,
verbose = FALSE)
Plot Mean Topic Prevalence Across Documents
df <- TextAnalysisR::SpecialEduTech
united_tbl <- TextAnalysisR::unite_text_cols(df, listed_vars = c("title", "keyword", "abstract"))
tokens <- TextAnalysisR::preprocess_texts(united_tbl, text_field = "united_texts")
dfm_object <- quanteda::dfm(tokens)
TextAnalysisR::plot_mean_topic_prevalence(
dfm_object = dfm_object,
topic_n = 15,
max.em.its = 75,
categorical_var = "reference_type",
continuous_var = "year",
top_term_n = 10,
top_topic_n = 15,
height = 500,
width = 1000,
verbose = FALSE)
Plot a Word Co-occurrence Network
df <- TextAnalysisR::SpecialEduTech
united_tbl <- TextAnalysisR::unite_text_cols(df, listed_vars = c("title", "keyword", "abstract"))
tokens <- TextAnalysisR::preprocess_texts(united_tbl, text_field = "united_texts")
dfm_object <- quanteda::dfm(tokens)
TextAnalysisR::plot_word_co_occurrence_network(
dfm_object,
co_occur_n = 200,
height = 900,
width = 800)
Plot a Word Correlation Network
df <- TextAnalysisR::SpecialEduTech
united_tbl <- TextAnalysisR::unite_text_cols(df, listed_vars = c("title", "keyword", "abstract"))
tokens <- TextAnalysisR::preprocess_texts(united_tbl, text_field = "united_texts")
dfm_object <- quanteda::dfm(tokens)
TextAnalysisR::plot_word_correlation_network(
dfm_object,
co_occur_n = 30,
corr_n = 0.4,
height = 900,
width = 800)
Plot Word Frequency Trends Over Time
df <- TextAnalysisR::SpecialEduTech
united_tbl <- TextAnalysisR::unite_text_cols(df, listed_vars = c("title", "keyword", "abstract"))
tokens <- TextAnalysisR::preprocess_texts(united_tbl, text_field = "united_texts")
dfm_object <- quanteda::dfm(tokens)
stm_15 <- TextAnalysisR::stm_15
TextAnalysisR::word_frequency_trends(dfm_object,
stm_model = stm_15,
time_variable = "year",
selected_terms = c("calculator", "computer"),
height = 500,
width = 1000)