📚 Today's Goals
- Master the basics of text preprocessing
- Learn word frequency analysis and TF-IDF
- Master sentiment analysis methods
- Learn topic modeling (LDA)
- Practice text classification
📝 Part 1: Text Preprocessing
1.1 Installing the Required Packages
# Install the text mining packages
install.packages(c("tm", "tidytext", "text2vec", "quanteda",
                   "topicmodels", "wordcloud", "syuzhet",
                   "textstem", "stopwords", "textdata"))
# Install tidyverse for data wrangling
install.packages("tidyverse")
# Load the packages
library(tm)          # text mining
library(tidytext)    # tidy text methods
library(tidyverse)   # data wrangling
library(wordcloud)   # word clouds
library(text2vec)    # text vectorization
library(quanteda)    # quantitative text analysis
library(topicmodels) # topic models
1.2 Loading Text Data
# Create a sample text data set
text_data <- data.frame(
id = 1:10,
text = c(
"我喜歡學習R語言,因為它功能強大。",
"數據分析是一項很有趣的工作。",
"機器學習可以幫助我們做出更好的預測。",
"文本挖掘是從文本中提取有用信息的技術。",
"自然語言處理是人工智能的重要分支。",
"深度學習在圖像識別方面表現出色。",
"統計學是數據科學的基礎。",
"數據可視化能讓數據更直觀易懂。",
"大數據技術正在改變世界。",
"R語言和Python都是優秀的數據科學工具。"
),
category = c("編程", "數據分析", "機器學習", "文本挖掘",
"人工智能", "深度學習", "統計學", "可視化",
"大數據", "數據科學"),
stringsAsFactors = FALSE
)
# Inspect the data
head(text_data)
# Reading text data from files
# Read a CSV file
# text_from_csv <- read.csv("text_data.csv", stringsAsFactors = FALSE)
# Read a TXT file
# text_from_txt <- readLines("text_data.txt", encoding = "UTF-8")
# Read a PDF file (requires the pdftools package)
# install.packages("pdftools")
# library(pdftools)
# text_from_pdf <- pdf_text("document.pdf")
# Read a Word document (requires the textreadr package, which has since been archived on CRAN)
# install.packages("textreadr")
# library(textreadr)
# text_from_docx <- read_docx("document.docx")
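When a corpus is spread over many plain-text files, they can be read in a single pass. A minimal sketch, assuming a hypothetical corpus/ folder of UTF-8 .txt files (the folder name and layout are illustrative, not part of the lesson data):
# Read every .txt file in a (hypothetical) corpus/ folder into one data frame
# txt_files <- list.files("corpus", pattern = "\\.txt$", full.names = TRUE)
# texts <- sapply(txt_files, function(f) paste(readLines(f, encoding = "UTF-8"), collapse = "\n"))
# corpus_df <- data.frame(file = basename(txt_files), text = texts, stringsAsFactors = FALSE)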
1.3 Text Preprocessing Functions
# Define a basic text cleaning function
clean_text <- function(text) {
  # Convert to lower case
  text <- tolower(text)
  # Remove punctuation
  text <- gsub("[[:punct:]]", " ", text)
  # Remove digits
  text <- gsub("[[:digit:]]", " ", text)
  # Collapse repeated whitespace
  text <- gsub("\\s+", " ", text)
  # Trim leading/trailing whitespace
  text <- trimws(text)
  return(text)
}
# Apply the cleaning function
text_data$clean_text <- sapply(text_data$text, clean_text)
# Compare a raw and a cleaned text
cat("Original:", text_data$text[1], "\n")
cat("Cleaned:", text_data$clean_text[1], "\n")
# A more advanced cleaning function
advanced_text_cleaning <- function(text) {
  # Remove URLs
  text <- gsub("https?://\\S+|www\\.\\S+", "", text)
  # Remove HTML tags
  text <- gsub("<.*?>", "", text)
  # Remove special characters (keep Chinese, letters, digits and whitespace)
  text <- gsub("[^\u4e00-\u9fa5a-zA-Z0-9\\s]", "", text, perl = TRUE)
  # Convert to lower case
  text <- tolower(text)
  # Collapse repeated whitespace
  text <- gsub("\\s+", " ", text)
  text <- trimws(text)
  return(text)
}
# Cleaning mixed Chinese/English text
clean_chinese_text <- function(text) {
  # Keep Chinese, English, digits, basic punctuation (including full-width quotes) and whitespace
  text <- gsub("[^\u4e00-\u9fa5a-zA-Z0-9,。!?、;:“”‘’\\s]", "", text, perl = TRUE)
  # Collapse repeated whitespace
  text <- gsub("\\s+", " ", text)
  text <- trimws(text)
  return(text)
}
# Example
chinese_text <- "R語言(R language)是一種用於統計計算和圖形的編程語言!"
clean_chinese_text(chinese_text)
1.4 Chinese Word Segmentation
# Install the Chinese word segmentation package
install.packages("jiebaR")
library(jiebaR)
# Initialize the segmentation engine
seg_engine <- worker()
# Simple segmentation
text <- "我喜歡學習R語言和數據分析"
segment_result <- segment(text, seg_engine)
print(segment_result)
# Add custom dictionary entries
add_words <- c("R語言", "數據分析", "機器學習", "深度學習")
new_user_word(seg_engine, add_words)
# Segment again with the custom dictionary
segment_with_dict <- segment(text, seg_engine)
print(segment_with_dict)
# Batch-process a vector of texts
batch_segment <- function(texts, engine) {
  results <- list()
  for (i in seq_along(texts)) {
    results[[i]] <- segment(texts[i], engine)
  }
  return(results)
}
# Segment every document
text_data$segmented <- batch_segment(text_data$clean_text, seg_engine)
# Inspect the segmentation results
for (i in 1:3) {
  cat("Original:", text_data$clean_text[i], "\n")
  cat("Segmented:", paste(text_data$segmented[[i]], collapse = " | "), "\n\n")
}
# Part-of-speech tagging (requires additional packages or services)
# Tools such as LTP or THULAC can be used for Chinese POS tagging
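jiebaR itself also provides a part-of-speech tagger, so an external service is not strictly required; a minimal sketch (the tag codes follow jieba's ictclas-style conventions):
# tag_engine <- worker("tag")                        # segmentation engine in POS-tagging mode
# tagging("我喜歡學習R語言和數據分析", tag_engine)    # returns the segmented words together with their POS tags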
1.5 Stop Word Handling
# Load Chinese stop words
# Define a Chinese stop word list
chinese_stopwords <- c(
  "的", "了", "在", "是", "我", "有", "和", "就", "不", "人", "都", "一", "一個",
  "上", "也", "很", "到", "説", "要", "去", "你", "會", "着", "沒有", "看", "好",
  "自己", "這", "那", "他", "她", "它", "我們", "你們", "他們", "她們", "它們",
  "這些", "那些", "這個", "那個", "這裏", "那裏", "這樣", "那樣",
  "這麼", "那麼", "怎麼", "什麼", "為什麼", "因為", "所以", "但是", "而且",
  "或者", "如果", "雖然", "然後", "以及", "等", "等等"
)
# English stop words (via the tm package)
english_stopwords <- stopwords("english")
# Look at the English stop words
head(english_stopwords, 20)
# Custom stop word list
custom_stopwords <- c("可以", "應該", "可能", "能夠", "需要", "想要", "希望")
# Function to remove stop words
remove_stopwords <- function(words, stopwords_list) {
  words[!words %in% stopwords_list]
}
# Example
sample_words <- c("我", "喜歡", "學習", "的", "r語言", "和", "數據分析")
filtered_words <- remove_stopwords(sample_words, chinese_stopwords)
print(filtered_words)
# Process the whole data set
text_data$filtered <- lapply(text_data$segmented, function(x) {
  remove_stopwords(x, chinese_stopwords)
})
# Inspect the results
for (i in 1:3) {
  cat("Segmented:", paste(text_data$segmented[[i]], collapse = " | "), "\n")
  cat("Stop words removed:", paste(text_data$filtered[[i]], collapse = " | "), "\n\n")
}
# Handling stop words with tidytext
library(tidytext)
# Build a stop word data frame (or read one from the web or a local file)
# chinese_stopwords_df <- data.frame(word = chinese_stopwords)
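With a stop-word data frame like the (commented-out) one above, a tidytext-style pipeline usually drops stop words with dplyr::anti_join() rather than filter(); a minimal sketch:
# chinese_stopwords_df <- data.frame(word = chinese_stopwords, stringsAsFactors = FALSE)
# text_data %>%
#   select(id, filtered) %>%
#   unnest(filtered) %>%
#   rename(word = filtered) %>%
#   anti_join(chinese_stopwords_df, by = "word")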
📊 Part 2: Word Frequency Analysis and TF-IDF
2.1 Word Frequency Statistics
# Basic word frequency counts
calculate_word_freq <- function(word_lists) {
  # Combine all words
  all_words <- unlist(word_lists)
  # Count word frequencies
  word_freq <- table(all_words)
  # Convert to a data frame and sort
  freq_df <- data.frame(
    word = names(word_freq),
    freq = as.numeric(word_freq),
    stringsAsFactors = FALSE
  ) %>%
    arrange(desc(freq))
  return(freq_df)
}
# Word frequencies for the sample data
freq_df <- calculate_word_freq(text_data$filtered)
head(freq_df, 10)
# Word frequency analysis with tidytext
tidy_text <- text_data %>%
  select(id, filtered) %>%
  unnest(filtered) %>%
  rename(word = filtered) %>%
  filter(!word %in% chinese_stopwords)
# Count word frequencies
word_counts <- tidy_text %>%
  count(word, sort = TRUE)
head(word_counts, 15)
# Visualize word frequencies
library(ggplot2)
# Bar chart of the top 20 words
word_counts_top20 <- word_counts %>%
  head(20)
ggplot(word_counts_top20, aes(x = reorder(word, n), y = n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Word frequencies (top 20)",
    x = "Word",
    y = "Count"
  ) +
  theme_minimal()
# Word cloud
library(wordcloud)
# Prepare the word cloud data
wordcloud_df <- word_counts %>%
  filter(n > 1)  # keep only words occurring more than once
# Generate the word cloud
wordcloud(
  words = wordcloud_df$word,
  freq = wordcloud_df$n,
  max.words = 50,
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2"),
  scale = c(3, 0.5)
)
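As an alternative to the static plot, the wordcloud2 package (not among the packages installed above, so an optional extra) renders an interactive HTML word cloud from the same two-column word/frequency data frame:
# install.packages("wordcloud2")
# library(wordcloud2)
# wordcloud2(word_counts, size = 0.8)  # first column = words, second column = frequencies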
2.2 N-gram Analysis
# Bigram analysis
library(tidytext)
# unnest_tokens() builds n-grams by splitting on whitespace, so for Chinese text
# we first rebuild each document from its segmented words, separated by spaces
text_data$seg_text <- sapply(text_data$segmented, paste, collapse = " ")
# Create bigrams
bigrams <- text_data %>%
  select(id, seg_text) %>%
  unnest_tokens(bigram, seg_text, token = "ngrams", n = 2)
# Look at the bigrams
head(bigrams, 10)
# Count bigram frequencies
bigram_counts <- bigrams %>%
  count(bigram, sort = TRUE)
head(bigram_counts, 10)
# Split each bigram into its two words
bigrams_separated <- bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")
# Filter out stop words
bigrams_filtered <- bigrams_separated %>%
  filter(
    !word1 %in% chinese_stopwords,
    !word2 %in% chinese_stopwords
  )
# Recombine and count
bigram_counts_clean <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ") %>%
  count(bigram, sort = TRUE)
head(bigram_counts_clean, 10)
# Bigram network graph
install.packages("igraph")
install.packages("ggraph")
library(igraph)
library(ggraph)
# Prepare the network data
bigram_graph <- bigram_counts_clean %>%
  filter(n > 1) %>%  # keep only bigrams occurring more than once
  separate(bigram, c("from", "to"), sep = " ") %>%
  graph_from_data_frame()
# Plot the network
set.seed(123)
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void() +
  labs(title = "Word co-occurrence network")
# Trigram analysis (again using the space-joined segmented text)
trigrams <- text_data %>%
  select(id, seg_text) %>%
  unnest_tokens(trigram, seg_text, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(
    !word1 %in% chinese_stopwords,
    !word2 %in% chinese_stopwords,
    !word3 %in% chinese_stopwords
  ) %>%
  unite(trigram, word1, word2, word3, sep = " ") %>%
  count(trigram, sort = TRUE)
head(trigrams, 10)
2.3 TF-IDF Analysis
# TF-IDF (term frequency - inverse document frequency)
# TF (term frequency): how often a term occurs within a document
# IDF (inverse document frequency): how distinctive the term is across the whole collection
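The computation behind bind_tf_idf() is small enough to check by hand; a minimal sketch with purely illustrative numbers (tidytext uses the natural logarithm for the IDF part):
# TF: share of the document's words that are this term
tf <- 2 / 50              # the term appears 2 times in a 50-word document
# IDF: log of (number of documents / number of documents containing the term)
idf <- log(10 / 3)        # 3 of the 10 documents contain the term
tf * idf                  # the TF-IDF weight of this term in this document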
# Compute TF-IDF with tidytext
text_data_tfidf <- text_data %>%
  select(id, category, filtered) %>%
  unnest(filtered) %>%
  rename(word = filtered) %>%
  filter(!word %in% chinese_stopwords) %>%
  count(category, word, sort = TRUE) %>%
  ungroup()
# Compute TF-IDF
total_words <- text_data_tfidf %>%
  group_by(category) %>%
  summarize(total = sum(n))
text_data_tfidf <- left_join(text_data_tfidf, total_words)
text_data_tfidf <- text_data_tfidf %>%
  bind_tf_idf(word, category, n)
# View the results
head(text_data_tfidf %>% arrange(desc(tf_idf)))
# The most characteristic words per category (sorted by TF-IDF)
top_words_by_category <- text_data_tfidf %>%
  group_by(category) %>%
  top_n(3, tf_idf) %>%
  ungroup() %>%
  arrange(category, desc(tf_idf))
print(top_words_by_category)
# Visualize the key words per category
top_words_by_category %>%
  mutate(word = reorder_within(word, tf_idf, category)) %>%
  ggplot(aes(x = word, y = tf_idf, fill = category)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ category, scales = "free") +
  coord_flip() +
  scale_x_reordered() +
  labs(
    title = "Most characteristic words per category (TF-IDF)",
    x = "Word",
    y = "TF-IDF"
  ) +
  theme_minimal()
# Document-level TF-IDF
document_tfidf <- text_data %>%
  select(id, filtered) %>%
  unnest(filtered) %>%
  rename(word = filtered) %>%
  filter(!word %in% chinese_stopwords) %>%
  count(id, word, sort = TRUE) %>%
  bind_tf_idf(word, id, n)
# The most important words in each document
top_words_per_document <- document_tfidf %>%
  group_by(id) %>%
  top_n(3, tf_idf) %>%
  ungroup() %>%
  arrange(id, desc(tf_idf))
print(top_words_per_document)
# Build a document-term matrix
dtm <- document_tfidf %>%
  cast_dtm(id, word, n)
# Inspect the document-term matrix
inspect(dtm[1:3, 1:10])
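A common follow-up on a document-term matrix is to drop very rare terms before modeling; a minimal sketch using tm::removeSparseTerms() (the 0.95 threshold is purely illustrative):
# dtm_trimmed <- removeSparseTerms(dtm, sparse = 0.95)  # drop terms missing from more than 95% of documents
# dim(dtm_trimmed)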
😊 Part 3: Sentiment Analysis
3.1 Dictionary-Based Sentiment Analysis
# Load sentiment dictionaries
# The syuzhet package provides sentiment analysis tools
library(syuzhet)
# Chinese sentiment analysis (requires a Chinese sentiment dictionary)
# Build a small Chinese sentiment dictionary
positive_chinese <- c(
  "喜歡", "愛", "好", "優秀", "棒", "贊", "強大", "有趣", "出色",
  "重要", "直觀", "易懂", "改變"
)
negative_chinese <- c(
  "討厭", "恨", "差", "糟糕", "爛", "困難", "複雜", "難懂", "失敗",
  "問題", "錯誤", "不足", "缺陷", "限制", "挑戰"
)
# A simple dictionary-based sentiment scorer
simple_sentiment_analysis <- function(text, pos_dict, neg_dict) {
  words <- segment(text, seg_engine)
  pos_count <- sum(words %in% pos_dict)
  neg_count <- sum(words %in% neg_dict)
  sentiment_score <- pos_count - neg_count
  return(list(
    positive = pos_count,
    negative = neg_count,
    score = sentiment_score,
    sentiment = ifelse(sentiment_score > 0, "積極",
                       ifelse(sentiment_score < 0, "消極", "中性"))
  ))
}
# Test the sentiment scorer
test_text <- "我喜歡學習R語言,它功能強大而且很有趣。"
sentiment_result <- simple_sentiment_analysis(test_text, positive_chinese, negative_chinese)
print(sentiment_result)
# Score the whole data set
text_data$sentiment <- sapply(text_data$clean_text, function(text) {
  result <- simple_sentiment_analysis(text, positive_chinese, negative_chinese)
  return(result$sentiment)
})
# Summarize the sentiment results
table(text_data$sentiment)
# English sentiment lexicons (e.g. the NRC lexicon)
library(textdata)
# Download the NRC lexicon (the first run prompts for a download)
# nrc_dict <- get_sentiments("nrc")
# Alternatively use the AFINN and Bing lexicons
afinn_dict <- get_sentiments("afinn")
bing_dict <- get_sentiments("bing")
# Look at the lexicons
head(afinn_dict)
head(bing_dict)
# English sentiment analysis example
english_text <- "I love learning R programming. It is powerful and interesting."
english_words <- unlist(strsplit(tolower(english_text), "\\W+"))
# AFINN lexicon (numeric scores)
afinn_scores <- afinn_dict$value[match(english_words, afinn_dict$word)]
total_score <- sum(afinn_scores, na.rm = TRUE)
# Bing lexicon (positive/negative labels)
bing_sentiments <- bing_dict$sentiment[match(english_words, bing_dict$word)]
sentiment_counts <- table(bing_sentiments[!is.na(bing_sentiments)])
cat("AFINN total score:", total_score, "\n")
cat("Bing sentiment counts:\n")
print(sentiment_counts)
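The same comparison can also be written as a tidytext-style pipeline that joins the tokens against each lexicon instead of using match():
# Tokens as a data frame, joined against the AFINN and Bing lexicons
english_tokens <- data.frame(word = english_words, stringsAsFactors = FALSE)
english_tokens %>%
  inner_join(afinn_dict, by = "word") %>%
  summarize(afinn_score = sum(value))   # same total as the match() approach
english_tokens %>%
  inner_join(bing_dict, by = "word") %>%
  count(sentiment)                      # counts of positive vs negative words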
3.2 Sentiment Trend Analysis
# Create a time-series text data set (simulated data)
set.seed(123)
time_series_text <- data.frame(
date = seq.Date(as.Date("2023-01-01"), as.Date("2023-01-31"), by = "day"),
text = c(
"今天很開心,項目進展順利。",
"遇到了一些技術難題,需要研究。",
"團隊合作很愉快,大家都很努力。",
"客户反饋不錯,繼續加油。",
"系統出現了一些問題,需要修復。",
"學習新知識很有收穫。",
"工作壓力有點大,需要調整。",
"完成了一個重要里程碑,值得慶祝。",
"遇到了挑戰,但能夠克服。",
"收到了積極的評價,很開心。",
"技術問題還沒有解決,有些困擾。",
"團隊氛圍很好,工作效率高。",
"項目延期了,需要加快進度。",
"學到了很多新技能,很有成就感。",
"工作與生活需要更好的平衡。",
"取得了不錯的成果,繼續努力。",
"遇到了一些挫折,但不會放棄。",
"收到了建設性的反饋,有助於改進。",
"工作節奏太快,需要休息。",
"看到自己的進步,很欣慰。",
"技術更新很快,需要不斷學習。",
"團隊溝通很順暢,合作愉快。",
"工作量有些大,需要合理安排。",
"解決了長期困擾的問題,很有成就感。",
"收到了客户的感謝,很感動。",
"遇到了一些困難,但相信能解決。",
"工作環境很舒適,有利於專注。",
"時間管理需要改進。",
"完成了既定目標,值得慶祝。",
"面臨新的挑戰,準備迎接。",
"整體來説,這個月收穫很多。"
),
stringsAsFactors = FALSE
)
# Sentiment scoring
time_series_text$sentiment_score <- sapply(time_series_text$text, function(text) {
  result <- simple_sentiment_analysis(text, positive_chinese, negative_chinese)
  return(result$score)
})
# Sentiment categories
time_series_text$sentiment_category <- ifelse(
  time_series_text$sentiment_score > 0, "積極",
  ifelse(time_series_text$sentiment_score < 0, "消極", "中性")
)
# View the results
head(time_series_text)
# Visualizing the sentiment trend
library(ggplot2)
# Daily sentiment score trend
ggplot(time_series_text, aes(x = date, y = sentiment_score)) +
  geom_line(color = "steelblue", size = 1) +
  geom_point(color = "darkblue", size = 2) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray") +
  labs(
    title = "Sentiment score trend",
    x = "Date",
    y = "Sentiment score"
  ) +
  theme_minimal()
# Distribution of sentiment categories
sentiment_summary <- time_series_text %>%
  group_by(sentiment_category) %>%
  summarize(count = n(), percentage = n() / nrow(time_series_text) * 100)
ggplot(sentiment_summary, aes(x = "", y = count, fill = sentiment_category)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y", start = 0) +
  geom_text(aes(label = paste0(round(percentage, 1), "%")),
            position = position_stack(vjust = 0.5)) +
  labs(
    title = "Distribution of sentiment categories",
    fill = "Sentiment"
  ) +
  theme_void()
# Smooth the sentiment trend with a moving average
library(zoo)
time_series_text$sentiment_ma <- rollmean(
  time_series_text$sentiment_score,
  k = 7,
  fill = NA,
  align = "right"
)
# Plot the raw scores and the moving average
ggplot(time_series_text, aes(x = date)) +
  geom_line(aes(y = sentiment_score), color = "lightblue", alpha = 0.5, size = 0.8) +
  geom_line(aes(y = sentiment_ma), color = "darkblue", size = 1.2) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray") +
  labs(
    title = "Sentiment score trend (raw vs 7-day moving average)",
    x = "Date",
    y = "Sentiment score"
  ) +
  theme_minimal()
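Aggregating to a coarser time unit is another simple way to smooth the series; a minimal sketch averaging the score per calendar week:
# Mean sentiment score per week
weekly_sentiment <- time_series_text %>%
  mutate(week = format(date, "%Y-%W")) %>%
  group_by(week) %>%
  summarize(mean_score = mean(sentiment_score), .groups = "drop")
print(weekly_sentiment)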
3.3 Advanced Sentiment Analysis
# Sentiment analysis with machine learning
# Prepare the training data
# Simulated data is used here; real applications need properly labelled sentiment data
set.seed(123)
n_samples <- 100
train_data <- data.frame(
text = c(
# Positive samples
"這個產品非常好用,我很喜歡。",
"服務質量優秀,非常滿意。",
"體驗很棒,下次還會再來。",
"效果超出預期,值得推薦。",
"操作簡單,功能強大。",
"性價比很高,物超所值。",
"設計精美,質量上乘。",
"客服態度很好,解決問題迅速。",
"使用方便,體驗流暢。",
"非常實用,幫助很大。",
# Negative samples
"產品質量很差,很失望。",
"服務態度惡劣,不滿意。",
"效果不好,不推薦購買。",
"操作複雜,很難使用。",
"價格太貴,不值這個價。",
"設計不合理,使用不便。",
"客服迴應慢,問題沒解決。",
"功能有限,不夠實用。",
"存在缺陷,需要改進。",
"體驗糟糕,不會再來。"
),
sentiment = c(rep("積極", 10), rep("消極", 10)),
stringsAsFactors = FALSE
)
# Repeat the rows to increase the sample size
train_data <- train_data[rep(1:nrow(train_data), 5), ]
rownames(train_data) <- NULL
# Add a few more samples
additional_texts <- c(
"非常喜歡,強烈推薦!",
"太棒了,超出預期!",
"完美,無可挑剔!",
"糟糕透頂,再也不買了!",
"太差了,浪費時間!",
"一般般,沒什麼特別的。",
"還行,可以接受。",
"馬馬虎虎,勉強能用。"
)
additional_sentiments <- c("積極", "積極", "積極", "消極", "消極", "中性", "中性", "中性")
train_data <- rbind(
train_data,
data.frame(
text = additional_texts,
sentiment = additional_sentiments,
stringsAsFactors = FALSE
)
)
# Text preprocessing
preprocess_text <- function(text) {
  text <- tolower(text)
  text <- gsub("[[:punct:]]", " ", text)
  text <- gsub("\\s+", " ", text)
  text <- trimws(text)
  return(text)
}
# Clean and then segment each text with jiebaR, joining the words with spaces;
# the tm tokenizer splits on whitespace and would otherwise treat an entire
# Chinese sentence as a single term
train_data$clean_text <- sapply(train_data$text, function(x) {
  paste(segment(preprocess_text(x), seg_engine), collapse = " ")
})
# Build a document-term matrix
library(tm)
# Create a corpus
corpus <- VCorpus(VectorSource(train_data$clean_text))
# Create the document-term matrix
dtm <- DocumentTermMatrix(corpus, control = list(
  weighting = weightTf,
  stopwords = TRUE,
  wordLengths = c(1, Inf),  # keep single-character Chinese words
  removeNumbers = TRUE,
  removePunctuation = TRUE
))
# Convert to a data frame
dtm_df <- as.data.frame(as.matrix(dtm))
# Add the sentiment labels
dtm_df$sentiment <- as.factor(train_data$sentiment)
# Split into training and test sets
set.seed(123)
train_index <- sample(1:nrow(dtm_df), 0.7 * nrow(dtm_df))
train_set <- dtm_df[train_index, ]
test_set <- dtm_df[-train_index, ]
# Train a naive Bayes classifier
library(e1071)
nb_model <- naiveBayes(sentiment ~ ., data = train_set)
# Predict
predictions <- predict(nb_model, test_set)
# Evaluate the model
confusion_matrix <- table(Predicted = predictions, Actual = test_set$sentiment)
print(confusion_matrix)
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
cat("Accuracy:", round(accuracy * 100, 2), "%\n")
# Predict the sentiment of new texts
new_texts <- c(
"這個產品真的很不錯,我很滿意。",
"質量太差了,非常失望。",
"一般般,沒什麼感覺。"
)
predict_sentiment <- function(texts, model, dtm_reference) {
  # Preprocess and segment, exactly as was done for the training data
  clean_texts <- sapply(texts, function(x) {
    paste(segment(preprocess_text(x), seg_engine), collapse = " ")
  })
  # Build a corpus for the new texts
  new_corpus <- VCorpus(VectorSource(clean_texts))
  # Build a DTM restricted to the training vocabulary
  new_dtm <- DocumentTermMatrix(new_corpus, control = list(
    dictionary = Terms(dtm_reference),
    weighting = weightTf,
    stopwords = TRUE,
    wordLengths = c(1, Inf),
    removeNumbers = TRUE,
    removePunctuation = TRUE
  ))
  new_dtm_df <- as.data.frame(as.matrix(new_dtm))
  # Predict
  predictions <- predict(model, new_dtm_df)
  return(data.frame(
    text = texts,
    predicted_sentiment = predictions,
    stringsAsFactors = FALSE
  ))
}
# Test the prediction function (Terms() expects a document-term matrix, so pass dtm, not the corpus)
results <- predict_sentiment(new_texts, nb_model, dtm)
print(results)
🧩 Part 4: Topic Modeling
4.1 LDA Topic Model Basics
# LDA (Latent Dirichlet Allocation) topic modeling
# Prepare the data
library(topicmodels)
library(tidytext)
# Create a larger text data set
set.seed(123)
n_docs <- 50
# Generate documents for three topics
topic1_docs <- replicate(20, paste(
sample(c("數據", "分析", "統計", "模型", "算法", "預測", "學習", "訓練"), 8, replace = TRUE),
collapse = " "
))
topic2_docs <- replicate(15, paste(
sample(c("文本", "挖掘", "語言", "處理", "情感", "分析", "分類", "聚類"), 8, replace = TRUE),
collapse = " "
))
topic3_docs <- replicate(15, paste(
sample(c("可視", "圖表", "圖形", "展示", "交互", "設計", "顏色", "佈局"), 8, replace = TRUE),
collapse = " "
))
all_docs <- c(topic1_docs, topic2_docs, topic3_docs)
# Build the document data frame
lda_data <- data.frame(
doc_id = 1:n_docs,
text = all_docs,
true_topic = c(rep("數據分析", 20), rep("文本挖掘", 15), rep("數據可視化", 15)),
stringsAsFactors = FALSE
)
# Preprocessing: split each document into words
lda_data$words <- lapply(lda_data$text, function(x) {
words <- unlist(strsplit(x, " "))
words[!words %in% chinese_stopwords]
})
# Build the document-term matrix
# First build a long data frame of per-document word counts
word_freq_list <- list()
for (i in 1:nrow(lda_data)) {
  words <- lda_data$words[[i]]
  word_counts <- table(words)
  for (word in names(word_counts)) {
    word_freq_list[[length(word_freq_list) + 1]] <- data.frame(
      doc_id = i,
      word = word,
      freq = as.numeric(word_counts[word])
    )
  }
}
word_freq_df <- do.call(rbind, word_freq_list)
# Cast into a document-term matrix
dtm_lda <- word_freq_df %>%
  cast_dtm(doc_id, word, freq)
# Inspect the DTM
inspect(dtm_lda[1:5, 1:10])
dim(dtm_lda)
# Fit the LDA model
set.seed(123)
lda_model <- LDA(dtm_lda, k = 3, control = list(seed = 123))
# Look at the model
lda_model
# Extract the topic-word distributions
topic_word_dist <- tidy(lda_model, matrix = "beta")
# The 10 most probable words for each topic
top_terms <- topic_word_dist %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
print(top_terms)
# Visualize the topic-word distributions
library(ggplot2)
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered() +
  labs(
    title = "LDA topic model - topic-word distributions",
    x = "Term",
    y = "Probability (beta)"
  ) +
  theme_minimal()
4.2 Document-Topic Distributions
# Extract the document-topic distributions
doc_topic_dist <- tidy(lda_model, matrix = "gamma")
# Look at the document-topic distributions
head(doc_topic_dist)
# Assign each document its dominant topic
doc_classifications <- doc_topic_dist %>%
  group_by(document) %>%
  slice_max(gamma, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(document = as.integer(document))
# Compare with the true topics
comparison <- lda_data %>%
  select(doc_id, true_topic) %>%
  left_join(doc_classifications, by = c("doc_id" = "document"))
# LDA topic numbers are arbitrary, so before measuring accuracy map each LDA topic
# to the true label that occurs most often among its documents
topic_label_map <- comparison %>%
  count(topic, true_topic) %>%
  group_by(topic) %>%
  slice_max(n, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  select(topic, predicted_topic = true_topic)
comparison <- comparison %>%
  left_join(topic_label_map, by = "topic")
# Inspect the comparison
head(comparison)
# Accuracy of the topic assignments
comparison$correct <- comparison$true_topic == comparison$predicted_topic
accuracy <- mean(comparison$correct)
cat("Topic classification accuracy:", round(accuracy * 100, 2), "%\n")
# Visualize the document-topic distributions
# Sample a few documents to display
sample_docs <- sample(1:n_docs, 10)
doc_topic_dist %>%
  filter(as.integer(document) %in% sample_docs) %>%
  mutate(document = factor(document, levels = as.character(sample_docs))) %>%
  ggplot(aes(factor(topic), gamma, fill = factor(topic))) +
  geom_col() +
  facet_wrap(~ document, ncol = 5) +
  labs(
    title = "Document-topic distributions (sample documents)",
    x = "Topic",
    y = "Probability (gamma)"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
4.3 Topic Model Tuning and Evaluation
# Searching for the best number of topics
# Evaluate the models with perplexity
# Fit models with different numbers of topics
topic_numbers <- c(2, 3, 4, 5, 6, 7, 8)
perplexity_values <- numeric(length(topic_numbers))
set.seed(123)
for (i in seq_along(topic_numbers)) {
  k <- topic_numbers[i]
  cat("Fitting the model with k =", k, "topics...\n")
  lda_temp <- LDA(dtm_lda, k = k, control = list(seed = 123))
  perplexity_values[i] <- perplexity(lda_temp, dtm_lda)
}
# Collect the results
perplexity_df <- data.frame(
  k = topic_numbers,
  perplexity = perplexity_values
)
# Plot perplexity against the number of topics
ggplot(perplexity_df, aes(x = k, y = perplexity)) +
  geom_line(color = "steelblue", size = 1) +
  geom_point(color = "darkblue", size = 3) +
  labs(
    title = "LDA model perplexity vs number of topics",
    x = "Number of topics (k)",
    y = "Perplexity"
  ) +
  theme_minimal()
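Perplexity is only one criterion; the ldatuning package (also not installed above, an optional extra) compares several published metrics across candidate values of k in one call:
# install.packages("ldatuning")
# library(ldatuning)
# tuning_result <- FindTopicsNumber(
#   dtm_lda,
#   topics = 2:8,
#   metrics = c("CaoJuan2009", "Deveaud2014"),
#   control = list(seed = 123)
# )
# FindTopicsNumber_plot(tuning_result)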
# Refit the model with the best number of topics
best_k <- topic_numbers[which.min(perplexity_values)]
cat("Best number of topics:", best_k, "\n")
set.seed(123)
best_lda_model <- LDA(dtm_lda, k = best_k, control = list(seed = 123))
# Extract the topics
best_topic_terms <- tidy(best_lda_model, matrix = "beta") %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
# Visualize the topics of the best model
best_topic_terms %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered() +
labs(
title = "Best LDA model - topic-word distributions",
x = "Term",
y = "Probability (beta)"
) +
theme_minimal()