# install.packages(c("readxl", "magrittr", "dplyr", "tidyr", "stringr", "tidyverse", "tidygraph", "igraph", "ggraph", "widyr", "ggplot2", "stm"))
# install.packages('devtools')
# devtools::install_github('haven-jeon/KoNLP')
# devtools::install_github("mshin77/TextAnalysisR")
최근 교육 현장 변화에 따른 미래 학습장애 학생 교육 지원 정책 개발을 위한 인식 분석
Analysis of Perceptions for the Development of Future Education Policies for Students with Learning Disabilities in Accordance With Recent Changes in the Education Field
강은영, 박해린, 신미경(2023) 논문에서 활용된 데이터와 R 코드는 오픈데이터저장소(https://osf.io/v3urc)에서도 다운로드 받을 수 있습니다.
1.1. R 패키지 설치
1.2. R 패키지 불러오기
suppressPackageStartupMessages({
library(readxl)
library(magrittr)
library(dplyr)
library(tidyr)
library(stringr)
library(KoNLP)
library(tidytext)
library(tidyverse)
library(DT)
library(tidygraph)
library(igraph)
library(ggraph)
library(widyr)
library(ggplot2)
library(TextAnalysisR)
library(quanteda)
library(stm)
})
2.1. 엑셀 파일 열기
<- read_excel("data/ld_policy_riss.xlsx")
data_riss <- data_riss %>%
data_riss mutate(document = paste(저자, "(", 발행연도, ")", sep = "")) %>%
select(document, everything())
<- data_riss %>% filter(policy_topic == "6차특수교육발전계획")
sixth_sped_riss <- data_riss %>% filter(policy_topic == "기초학력지원센터_특수교육지원센터")
support_center_riss <- data_riss %>% filter(policy_topic == "학교교육_학교밖_교육연계") out_of_school_riss
2.2. 텍스트 열 선택
Code
<- sixth_sped_riss %>%
sixth_sped_riss_text select("제목", "주제어", "국문 초록 (Abstract)") %>%
unite(
col = combined_text,
sep = " ",
remove = FALSE
)
<- left_join(sixth_sped_riss, sixth_sped_riss_text, by = "제목") %>%
sixth_sped_riss_data select(-ends_with(".y")) %>%
rename_with(~ sub("\\.x$", "", .), ends_with(".x"))
<- support_center_riss %>%
support_center_riss_text select("제목", "주제어", "국문 초록 (Abstract)") %>%
unite(
col = combined_text,
sep = " ",
remove = FALSE
)
<- left_join(support_center_riss, support_center_riss_text, by = "제목") %>%
support_center_riss_data select(-ends_with(".y")) %>%
rename_with(~ sub("\\.x$", "", .), ends_with(".x"))
<- out_of_school_riss %>%
out_of_school_riss_text select("제목", "주제어", "국문 초록 (Abstract)") %>%
unite(
col = combined_text,
sep = " ",
remove = FALSE
)
<- left_join(out_of_school_riss, out_of_school_riss_text, by = "제목") %>%
out_of_school_riss_data select(-ends_with(".y")) %>%
rename_with(~ sub("\\.x$", "", .), ends_with(".x"))
2.3. 단어 빈도-역문서 빈도 분석에 필요한 함수 정의
Code
# 불필요한 패턴 제거 함수
# This function is to remove specific patterns from a text column.
#
# @param {data} The input dataframe.
# @param {text_col} The name of the text column in the dataframe.
# @param {...} Additional arguments to be passed to the function.
# @return The cleaned dataframe with an additional processed_text column.
<- function(data, text_col, ...) {
rm_patterns <- "(HTTP(S)?://)?([A-Z0-9]+(-?[A-Z0-9])*\\.)+[A-Z0-9]{2,}(/\\S*)?"
homepage <- "(19|20)?\\d{2}[-/.][0-3]?\\d[-/.][0-3]?\\d"
date <- "0\\d{1,2}[- ]?\\d{2,4}[- ]?\\d{4}"
phone <- "[A-Z0-9.-]+@[A-Z0-9.-]+"
email <- "[ㄱ-ㅎㅏ-ㅣ]+"
hangule <- "[:punct:]"
punctuation <- "[^가-힣A-Z0-9]"
text_p
<- data %>%
cleaned_data mutate(
processed_text = !!sym(text_col) %>%
str_remove_all(homepage) %>%
str_remove_all(date) %>%
str_remove_all(phone) %>%
str_remove_all(email) %>%
str_remove_all(hangule) %>%
str_replace_all(punctuation, " ") %>%
str_replace_all(text_p, " ") %>%
str_squish()
)
return(cleaned_data)
}
# 용언과 체언 추출 함수
# This function is to extract morphemes (noun and verb families).
#
# @param {data} The input data to be processed.
# @param {text_col} The name of the text column in the dataframe.
# @returns {pos_td} The processed data containing the extracted nouns and verbs.
<- function(data, text_col) {
extract_pos <- data %>%
pos_init unnest_tokens(pos_init, text_col, token = SimplePos09) %>%
mutate(pos_n = row_number())
<- pos_init %>%
noun filter(str_detect(pos_init, "/n")) %>%
mutate(pos = str_remove(pos_init, "/.*$"))
<- pos_init %>%
verb filter(str_detect(pos_init, "/p")) %>%
mutate(pos = str_replace_all(pos_init, "/.*$", "다"))
<- bind_rows(noun, verb) %>%
pos_td arrange(pos_n) %>%
filter(nchar(pos) > 1) %>%
tibble()
return(pos_td)
}
# 단어 빈도-역문서 빈도 계산 함수
# Calculates the term frequency-inverse document frequency (TF-IDF) for a given dataset.
#
# @param data The dataset to calculate TF-IDF on.
# @param pos The column name representing the term positions.
# @param document The column name representing the documents.
# @param ... Additional arguments to be passed to the function.
# @return A datatable displaying the TF-IDF values, sorted in descending order.
<- function(data, pos, document, ...) {
calculate_tf_idf <- data %>%
tf_idf bind_tf_idf(pos, document, n) %>%
arrange(desc(tf_idf)) %>%
mutate_if(is.numeric, ~ round(., 3))
<- tf_idf %>%
tf_idf_dt datatable(options = list(
pageLength = 5,
initComplete = JS(
"
function(settings, json) {
$(this.api().table().header()).css({
'font-family': 'Arial, sans-serif',
'font-size': '16px',
});
}
"
)%>%
)) formatStyle(columns = colnames(.$x$data), `font-size` = "15px")
return(tf_idf_dt)
}
2.4. 6차특수교육발전계획 단어 빈도-역문서 빈도
Code
%>%
sixth_sped_riss_data rm_patterns(text_col = "combined_text") %>%
extract_pos(text_col = "processed_text") %>%
count(document, pos, sort = TRUE) %>%
calculate_tf_idf(term = "pos", document = "document")
2.5. 기초학력지원센터 특수교육지원센터 단어 빈도-역문서 빈도
Code
%>%
support_center_riss_data rm_patterns(text_col = "combined_text") %>%
extract_pos(text_col = "processed_text") %>%
count(document, pos, sort = TRUE) %>%
calculate_tf_idf(term = "pos", document = "document")
2.6. 학교교육 학교밖 교육연계 단어 빈도-역문서 빈도
Code
%>%
out_of_school_riss_data rm_patterns(text_col = "combined_text") %>%
extract_pos(text_col = "processed_text") %>%
count(document, pos, sort = TRUE) %>%
calculate_tf_idf(term = "pos", document = "document")
3.1. 데이터 준비
<- read_excel("data/ld_policy_news.xlsx")
data_news <- data_news %>%
data_news mutate(document = paste0("text", row.names(.))) %>%
select(document, everything())
<- data_news %>% filter(policy_topic == "6차특수교육발전계획")
sixth_sped_news <- data_news %>% filter(policy_topic == "기초학력지원센터_특수교육지원센터")
support_center_news <- data_news %>% filter(policy_topic == "학교교육_학교밖_교육연계")
out_of_school_news
# 중복 기사 확인 및 삭제
# sixth_sped_news %>% duplicated() %>% sum()
%<>% filter(!duplicated(.))
sixth_sped_news
# support_center_news %>% duplicated() %>% sum()
%<>% filter(!duplicated(.))
support_center_news
# out_of_school_news %>% duplicated() %>% sum()
%<>% filter(!duplicated(.))
out_of_school_news
# 50자 이상의 뉴스 기사 선택 및 문자 길이 범위 확인
<- sixth_sped_news %>% filter(nchar(sixth_sped_news$article) >= 50)
sixth_sped_news_filtered <- support_center_news %>% filter(nchar(support_center_news$article) >= 50)
support_center_news_filtered <- out_of_school_news %>% filter(nchar(out_of_school_news$article) >= 50) out_of_school_news_filtered
3.2. 단어 네트워크 분석에 필요한 함수 정의
Code
# 단어 간 쌍별 상관관계 계산 함수
# Calculates pairwise correlation between data based on specified criteria.
#
# @param {data} The input data frame.
# @param {pos} The column name for grouping the data.
# @param {pattern} The pattern to match in the item1 column.
# @param {document} The column name for calculating correlation.
# @param {cor_value} The minimum correlation value to consider.
# @param {count} The minimum count of observations per group.
# @param {...} Additional arguments to be passed to the pairwise_cor function.
# @return A {filtered_data} frame containing pairwise correlations.
<- function(data, pos, pattern, document, cor_value, count, ...) {
calculate_pairwise_cor <- data %>%
cor group_by(pos) %>%
filter(n() >= count) %>%
pairwise_cor(pos, document, sort = TRUE) %>%
ungroup()
<- cor %>%
filter_meta arrange(desc(correlation)) %>%
filter(grepl(pattern = pattern, item1))
<- filter_meta %>%
filtered_data filter(correlation > cor_value)
return(filtered_data)
}
# 단어 네트워크 레이아웃 함수
# Creates a graph layout based on the provided data.
#
# @param {data} The data used to create the graph.
# @param {layout_type} The type of layout to use (default: "fr").
# @returns {layout_meta} The graph layout.
<- function(data, layout_type = "fr") {
create_graph <- as_tbl_graph(data, directed = TRUE)
graph_meta
<- igraph::vcount(graph_meta)
node_num <- graph_meta %>%
graph_meta_centr mutate(centrality = centrality_degree(mode = "out") / (node_num - 1))
<- create_layout(graph_meta_centr, layout = layout_type)
layout_meta
return(layout_meta)
}
# 단어 네트워크 그래프 함수
# This function takes a data frame and creates a network graph visualization.
#
# @param {data} The input data frame.
# @param {...} Additional arguments to be passed to the function.
# @returns {network_meta_gg} The network graph as a ggplot_gtable object.
<- function(data, ...) {
plot_graph <- data %>%
network_meta ggraph() +
geom_edge_link(aes(
edge_alpha = correlation,
edge_width = correlation
edge_colour = "#56c1f0") +
), geom_node_point(aes(colour = centrality), size = 5) +
geom_node_text(aes(label = name), size = 4, repel = TRUE) +
scale_color_continuous(guide = "legend", high = "#D2B55B", low = "#FCF4A3") +
theme_void(base_size = 6) +
theme(
legend.title = element_text(size = 11),
legend.text = element_text(size = 11),
legend.position = "right"
)
<- ggplot_build(network_meta)
network_meta_g <- ggplot_gtable(network_meta_g)
network_meta_gg
return(network_meta_gg)
}
3.3. 한글 텍스트 전처리
Code
<- sixth_sped_news_filtered %>%
sixth_sped_join rm_patterns(text_col = "article") %>%
extract_pos(text_col = "processed_text")
<- support_center_news_filtered %>%
support_center_join rm_patterns(text_col = "article") %>%
extract_pos(text_col = "processed_text")
<- out_of_school_news_filtered %>%
out_of_school_join rm_patterns(text_col = "article") %>%
extract_pos(text_col = "processed_text")
3.4. 6차특수교육발전계획 단어 네트워크 시각화
Code
set.seed(2023)
<- sixth_sped_join %>%
sixth_sped_corr calculate_pairwise_cor(
pos = "pos",
pattern = "장애",
document = "document",
cor_value = 0.5,
count = 5
)
%>% as_tbl_graph(data, directed = TRUE) sixth_sped_corr
# A tbl_graph: 39 nodes and 39 edges
#
# A directed simple graph with 5 components
#
# Node Data: 39 × 1 (active)
name
<chr>
1 학교장애인식지수
2 지적장애
3 장애학생
4 장애인
5 장애유형
6 지체장애
7 장애
8 비장애
9 진단
10 259개
# ℹ 29 more rows
#
# Edge Data: 39 × 3
from to correlation
<int> <int> <dbl>
1 1 9 0.813
2 2 10 0.747
3 2 11 0.747
# ℹ 36 more rows
Code
<- sixth_sped_corr %>%
sixth_sped_gg create_graph(layout_type = "fr") %>%
plot_graph()
ggsave("figure/sixth_sped_gg.png", width = 8, height = 6)
3.5. 기초학력지원센터 특수교육지원센터 단어 네트워크 시각화
Code
set.seed(2023)
<- support_center_join %>%
support_center_corr calculate_pairwise_cor(
pos = "pos",
pattern = "장애",
document = "document",
cor_value = 0.6,
count = 5
)
%>% as_tbl_graph(data, directed = TRUE) support_center_corr
# A tbl_graph: 52 nodes and 60 edges
#
# A directed simple graph with 3 components
#
# Node Data: 52 × 1 (active)
name
<chr>
1 발달장애학생
2 장애학생들
3 청각장애학생
4 지적장애
5 등교개학
6 모의평가
7 학생평가
8 접속
9 과제물
10 방문교육
# ℹ 42 more rows
#
# Edge Data: 60 × 3
from to correlation
<int> <int> <dbl>
1 1 5 1
2 1 6 1
3 1 7 1
# ℹ 57 more rows
Code
<- support_center_corr %>%
support_center_gg create_graph(layout_type = "fr") %>%
plot_graph()
ggsave("figure/support_center_gg.png", width = 8, height = 6)
3.6. 학교교육 학교밖 교육연계 단어 네트워크 시각화
Code
set.seed(2023)
<- out_of_school_join %>%
out_of_school_corr calculate_pairwise_cor(
pos = "pos",
pattern = "학교밖",
document = "document",
cor_value = 0.35,
count = 4
)
%>% as_tbl_graph(data, directed = TRUE) out_of_school_corr
# A tbl_graph: 53 nodes and 51 edges
#
# A rooted forest with 2 trees
#
# Node Data: 53 × 1 (active)
name
<chr>
1 학교밖청소년
2 학교밖청소년지원센터
3 전라남도교육청
4 자기계발
5 전남도
6 민선3기
7 학교복귀
8 정보연계
9 의무교육단계
10 이하
# ℹ 43 more rows
#
# Edge Data: 51 × 3
from to correlation
<int> <int> <dbl>
1 1 3 0.746
2 1 4 0.660
3 1 5 0.643
# ℹ 48 more rows
Code
<- out_of_school_corr %>%
out_of_school_gg create_graph(layout_type = "fr") %>%
plot_graph()
ggsave("figure/out_of_school_gg.png", width = 8.5, height = 6)
4.1. 엑셀 파일 열기
<- read_excel("data/ld_policy_delphi.xlsx")
data_delphi
<- data_delphi %>% filter(policy_topic == "6차특수교육발전계획")
sixth_sped_delphi <- data_delphi %>% filter(policy_topic == "기초학력지원센터_특수교육지원센터")
support_center_delphi <- data_delphi %>% filter(policy_topic == "학교교육_학교밖_교육연계") out_of_school_delphi
4.2. 한글 텍스트 전처리
Code
<- sixth_sped_delphi %>%
sixth_sped_delphi_data rm_patterns(text_col = "response") %>%
extract_pos(text_col = "processed_text")
<- support_center_delphi %>%
support_center_delphi_data rm_patterns(text_col = "response") %>%
extract_pos(text_col = "processed_text")
<- out_of_school_delphi %>%
out_of_school_delphi_data rm_patterns(text_col = "response") %>%
extract_pos(text_col = "processed_text")
4.3. 6차특수교육발전계획 토픽 모델링
평균 주제 발현율
Code
<- c("좋다", "좋겠습니", "피다", "많다", "있다", "하다", "위하다", "받다", "대하다", "이루다", "같다", "아니다", "되다", "보입니다", "보입니", "같습니", "같습니다", "경우", "통하다", "않다", "못하다", "없다", "지다")
stop_words
$pos <- case_when(
sixth_sped_delphi_data$pos== "학생들" ~ "학생",
sixth_sped_delphi_data$pos== "학부모들이" ~ "학부모",
sixth_sped_delphi_data
TRUE ~ as.character(sixth_sped_delphi_data$pos)
)
<- sixth_sped_delphi_data %>%
sixth_sped_delphi_dfm filter(!pos %in% stop_words) %>%
count(participant, pos, sort = TRUE) %>%
cast_dfm(participant, pos, n)
<- stm(sixth_sped_delphi_dfm, K = 4, init.type = "Spectral", verbose = FALSE)
topic_model_sixth_sped
<- tidy(topic_model_sixth_sped, matrix = "gamma",
gamma_sixth_sped_td document_names = rownames(sixth_sped_delphi_dfm ))
<- gamma_sixth_sped_td %>%
gamma_terms_sixth_sped group_by(topic) %>%
summarise(gamma = mean(gamma)) %>%
arrange(desc(gamma)) %>%
mutate(topic = reorder(topic, gamma))
Code
load("data/gamma_terms.Rdata")
gamma_terms_sixth_sped
# A tibble: 4 × 2
topic gamma
<fct> <dbl>
1 2 0.300
2 4 0.300
3 1 0.200
4 3 0.200
주제별 연관 단어 비율
Code
<- tidy(topic_model_sixth, matrix = "beta")
beta_sixth_sped
<- beta_sixth_sped %>%
beta_sixth_sped_gg group_by(topic) %>%
top_n(15, beta) %>%
ungroup() %>%
mutate(
topic = paste0("Topic ", topic),
term = reorder_within(term, beta, topic)
%>%
) ggplot(aes(term, beta, fill = as.factor(topic))) +
geom_col(alpha = 0.8, show.legend = FALSE) +
facet_wrap(~topic, scales = "free_y") +
coord_flip() +
scale_x_reordered() +
theme_minimal()
ggsave("figure/beta_sixth_sped_gg.png", beta_sixth_sped_gg, width = 8.5, height = 7)
4.4. 기초학력지원센터 특수교육지원센터 토픽 모델링
평균 주제 발현율
Code
<- support_center_delphi_data %>%
support_center_delphi_dfm filter(!pos %in% stop_words) %>%
count(participant, pos, sort = TRUE) %>%
cast_dfm(participant, pos, n)
<- stm(support_center_delphi_dfm, K = 3, init.type = "Spectral", verbose = FALSE)
topic_model_support_center
<- tidy(topic_model_support_center, matrix = "gamma",
gamma_support_center_td document_names = rownames(support_center_delphi_dfm))
<- gamma_support_center_td %>%
gamma_terms_support_center group_by(topic) %>%
summarise(gamma = mean(gamma)) %>%
arrange(desc(gamma)) %>%
mutate(topic = reorder(topic, gamma))
Code
gamma_terms_support_center
# A tibble: 3 × 2
topic gamma
<fct> <dbl>
1 1 0.400
2 2 0.300
3 3 0.300
주제별 연관 단어 비율
Code
<- tidy(topic_model_support_center, matrix = "beta")
beta_support_center
<- beta_support_center %>%
beta_support_center_gg group_by(topic) %>%
top_n(15, beta) %>%
ungroup() %>%
mutate(
topic = paste0("Topic ", topic),
term = reorder_within(term, beta, topic)
%>%
) ggplot(aes(term, beta, fill = as.factor(topic))) +
geom_col(alpha = 0.8, show.legend = FALSE) +
facet_wrap(~topic, scales = "free_y") +
coord_flip() +
scale_x_reordered() +
theme_minimal()
ggsave("figure/beta_support_center_gg.png", beta_support_center_gg, width = 8.5, height = 5)
4.5. 학교교육 학교밖 교육연계 토픽 모델링
평균 주제 발현율
Code
<- out_of_school_delphi_data %>%
out_of_school_delphi_dfm filter(!pos %in% stop_words) %>%
count(participant, pos, sort = TRUE) %>%
cast_dfm(participant, pos, n)
<- stm(out_of_school_delphi_dfm, K = 3, init.type = "Spectral", verbose = FALSE)
topic_model_out_of_school
<- tidy(topic_model_out_of_school, matrix = "gamma",
gamma_out_of_school_td document_names = rownames(out_of_school_delphi_dfm))
<- gamma_out_of_school_td %>%
gamma_terms_out_of_school group_by(topic) %>%
summarise(gamma = mean(gamma)) %>%
arrange(desc(gamma)) %>%
mutate(topic = reorder(topic, gamma))
Code
gamma_terms_out_of_school
# A tibble: 3 × 2
topic gamma
<fct> <dbl>
1 3 0.499
2 1 0.300
3 2 0.200
주제별 연관 단어 비율
Code
<- tidy(topic_model_out_of_school, matrix = "beta")
beta_out_of_school
<- beta_out_of_school %>%
beta_out_of_school_gg group_by(topic) %>%
top_n(15, beta) %>%
ungroup() %>%
mutate(
topic = paste0("Topic ", topic),
term = reorder_within(term, beta, topic)
%>%
) ggplot(aes(term, beta, fill = as.factor(topic))) +
geom_col(alpha = 0.8, show.legend = FALSE) +
facet_wrap(~topic, scales = "free_y") +
coord_flip() +
scale_x_reordered() +
theme_minimal()
ggsave("figure/beta_out_of_school_gg.png", beta_out_of_school_gg, width = 8.5, height = 5)