📚 Today's Goals

  1. Master basic text preprocessing techniques
  2. Learn word frequency analysis and TF-IDF
  3. Master sentiment analysis methods
  4. Learn topic modeling (LDA)
  5. Practice text classification

📝 Part 1: Text Preprocessing

1.1 Installing the Required Packages

# Install the text mining packages
install.packages(c("tm", "tidytext", "text2vec", "quanteda", 
                   "topicmodels", "wordcloud", "syuzhet",
                   "textstem", "stopwords", "textdata"))

# Install tidyverse for data wrangling
install.packages("tidyverse")

# Load the packages
library(tm)          # text mining
library(tidytext)    # tidy text analysis
library(tidyverse)   # data wrangling
library(wordcloud)   # word clouds
library(text2vec)    # text vectorization
library(quanteda)    # quantitative text analysis
library(topicmodels) # topic models

1.2 Loading Text Data

# Create a sample text data set
text_data <- data.frame(
  id = 1:10,
  text = c(
    "我喜歡學習R語言,因為它功能強大。",
    "數據分析是一項很有趣的工作。",
    "機器學習可以幫助我們做出更好的預測。",
    "文本挖掘是從文本中提取有用信息的技術。",
    "自然語言處理是人工智能的重要分支。",
    "深度學習在圖像識別方面表現出色。",
    "統計學是數據科學的基礎。",
    "數據可視化能讓數據更直觀易懂。",
    "大數據技術正在改變世界。",
    "R語言和Python都是優秀的數據科學工具。"
  ),
  category = c("編程", "數據分析", "機器學習", "文本挖掘", 
               "人工智能", "深度學習", "統計學", "可視化", 
               "大數據", "數據科學"),
  stringsAsFactors = FALSE
)

# Inspect the data
head(text_data)

# Reading text data from files
# Read a CSV file
# text_from_csv <- read.csv("text_data.csv", stringsAsFactors = FALSE)

# Read a TXT file
# text_from_txt <- readLines("text_data.txt", encoding = "UTF-8")

# Read a PDF file (requires the pdftools package)
# install.packages("pdftools")
# library(pdftools)
# text_from_pdf <- pdf_text("document.pdf")

# Read a Word document (requires the textreadr package)
# install.packages("textreadr")
# library(textreadr)
# text_from_docx <- read_docx("document.docx")

1.3 Text Preprocessing Functions

# Create a text cleaning function
clean_text <- function(text) {
  # Convert to lower case
  text <- tolower(text)
  
  # Remove punctuation
  text <- gsub("[[:punct:]]", " ", text)
  
  # Remove digits
  text <- gsub("[[:digit:]]", " ", text)
  
  # Collapse extra whitespace
  text <- gsub("\\s+", " ", text)
  
  # Trim leading and trailing whitespace
  text <- trimws(text)
  
  return(text)
}

# Apply the cleaning function
text_data$clean_text <- sapply(text_data$text, clean_text)

# Compare original and cleaned text
cat("Original text:", text_data$text[1], "\n")
cat("Cleaned text:", text_data$clean_text[1], "\n")

# A more advanced preprocessing function
advanced_text_cleaning <- function(text) {
  # Remove URLs
  text <- gsub("https?://\\S+|www\\.\\S+", "", text)
  
  # Remove HTML tags
  text <- gsub("<.*?>", "", text)
  
  # Remove special characters (keep Chinese, letters, digits, and whitespace)
  text <- gsub("[^\u4e00-\u9fa5a-zA-Z0-9\\s]", "", text, perl = TRUE)
  
  # Convert to lower case
  text <- tolower(text)
  
  # Collapse extra whitespace
  text <- gsub("\\s+", " ", text)
  text <- trimws(text)
  
  return(text)
}
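
A quick check may help here; the string below is a made-up example mixing a URL, an HTML tag, and punctuation, all of which should be stripped while digits are kept.

# Example input is hypothetical
messy_text <- "訪問 https://example.com 查看 <b>詳情</b> 123!"
advanced_text_cleaning(messy_text)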

# Handle mixed Chinese/English text
clean_chinese_text <- function(text) {
  # Keep Chinese characters, English letters, digits, and basic punctuation
  text <- gsub("[^\u4e00-\u9fa5a-zA-Z0-9,。!?、;:\"\"''\\s]", "", text, perl = TRUE)
  
  # Collapse extra whitespace
  text <- gsub("\\s+", " ", text)
  text <- trimws(text)
  
  return(text)
}

# Example
chinese_text <- "R語言(R language)是一種用於統計計算和圖形的編程語言!"
clean_chinese_text(chinese_text)

1.4 Chinese Word Segmentation

# Install the Chinese word segmentation package
install.packages("jiebaR")
library(jiebaR)

# Initialize the segmentation engine
seg_engine <- worker()

# Basic segmentation
text <- "我喜歡學習R語言和數據分析"
segment_result <- segment(text, seg_engine)
print(segment_result)

# Add custom dictionary entries
add_words <- c("R語言", "數據分析", "機器學習", "深度學習")
new_user_word(seg_engine, add_words)

# Segment again with the custom dictionary
segment_with_dict <- segment(text, seg_engine)
print(segment_with_dict)

# Batch-process multiple texts
batch_segment <- function(texts, engine) {
  results <- list()
  for (i in seq_along(texts)) {
    results[[i]] <- segment(texts[i], engine)
  }
  return(results)
}

# Segment all texts in the data set
text_data$segmented <- batch_segment(text_data$clean_text, seg_engine)

# Inspect the segmentation results
for (i in 1:3) {
  cat("Original:", text_data$clean_text[i], "\n")
  cat("Segmented:", paste(text_data$segmented[[i]], collapse = " | "), "\n\n")
}

# Part-of-speech tagging (needs additional tools)
# Tools such as LTP or THULAC can also be used for Chinese POS tagging
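
jiebaR itself also ships a part-of-speech tagger; a minimal sketch using its "tag" worker (default dictionaries assumed):

# Sketch: POS tagging with jiebaR's built-in tagger
tag_engine <- worker(type = "tag")
tagging("我喜歡學習R語言和數據分析", tag_engine)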

1.5 Stop Word Handling

# Chinese stop words
# Create a simple Chinese stop word list (duplicates removed)
chinese_stopwords <- c(
  "的", "了", "在", "是", "我", "有", "和", "就", "不", "人", "都", "一", "一個",
  "上", "也", "很", "到", "説", "要", "去", "你", "會", "着", "沒有", "看", "好",
  "自己", "這", "那", "他", "她", "它", "我們", "你們", "他們", "她們", "它們",
  "這些", "那些", "這個", "那個", "這裏", "那裏", "這樣", "那樣",
  "這麼", "那麼", "怎麼", "什麼", "為什麼", "因為", "所以", "但是", "而且",
  "或者", "如果", "雖然", "然後", "以及", "等", "等等"
)

# English stop words (using the tm package)
english_stopwords <- stopwords("en")

# Peek at the English stop words
head(english_stopwords, 20)

# Custom stop word list
custom_stopwords <- c("可以", "應該", "可能", "能夠", "需要", "想要", "希望")

# Function to remove stop words
remove_stopwords <- function(words, stopwords_list) {
  words[!words %in% stopwords_list]
}

# Example
sample_words <- c("我", "喜歡", "學習", "的", "r語言", "和", "數據分析")
filtered_words <- remove_stopwords(sample_words, chinese_stopwords)
print(filtered_words)

# Apply to the whole data set
text_data$filtered <- lapply(text_data$segmented, function(x) {
  remove_stopwords(x, chinese_stopwords)
})

# Inspect the results
for (i in 1:3) {
  cat("Segmented:", paste(text_data$segmented[[i]], collapse = " | "), "\n")
  cat("After stop word removal:", paste(text_data$filtered[[i]], collapse = " | "), "\n\n")
}

# Handling stop words with tidytext
library(tidytext)

# Load Chinese stop words (from the web or a local file)
# chinese_stopwords_df <- data.frame(word = chinese_stopwords)
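
A minimal sketch of this tidytext-style approach, using the chinese_stopwords vector defined above and dplyr's anti_join():

# Sketch: stop word removal via anti_join
chinese_stopwords_df <- data.frame(word = chinese_stopwords, stringsAsFactors = FALSE)
sample_tokens <- data.frame(word = c("我", "喜歡", "數據分析"), stringsAsFactors = FALSE)
sample_tokens %>% anti_join(chinese_stopwords_df, by = "word")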

📊 Part 2: Word Frequency Analysis and TF-IDF

2.1 Word Frequency Counts

# Basic word frequency counts
calculate_word_freq <- function(word_lists) {
  # Combine all words
  all_words <- unlist(word_lists)
  
  # Count word frequencies
  word_freq <- table(all_words)
  
  # Convert to a data frame and sort
  freq_df <- data.frame(
    word = names(word_freq),
    freq = as.numeric(word_freq),
    stringsAsFactors = FALSE
  ) %>%
    arrange(desc(freq))
  
  return(freq_df)
}

# Word frequencies for the sample data
freq_df <- calculate_word_freq(text_data$filtered)
head(freq_df, 10)

# Word frequency analysis with tidytext
tidy_text <- text_data %>%
  select(id, filtered) %>%
  unnest(filtered) %>%
  rename(word = filtered) %>%
  filter(!word %in% chinese_stopwords)

# Count word frequencies
word_counts <- tidy_text %>%
  count(word, sort = TRUE)

head(word_counts, 15)

# Visualize word frequencies
library(ggplot2)

# Bar chart of the top 20 words
word_counts_top20 <- word_counts %>%
  head(20)

ggplot(word_counts_top20, aes(x = reorder(word, n), y = n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "詞頻統計(Top 20)",
    x = "詞語",
    y = "出現次數"
  ) +
  theme_minimal()

# Word cloud
library(wordcloud)

# Prepare the word cloud data
wordcloud_df <- word_counts %>%
  filter(n > 1)  # keep only words that appear more than once

# Generate the word cloud
wordcloud(
  words = wordcloud_df$word,
  freq = wordcloud_df$n,
  max.words = 50,
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2"),
  scale = c(3, 0.5)
)

2.2 N-gram Analysis

# Bigram analysis
library(tidytext)

# Create bigrams
bigrams <- text_data %>%
  select(id, clean_text) %>%
  unnest_tokens(bigram, clean_text, token = "ngrams", n = 2)

# Inspect the bigrams
head(bigrams, 10)

# Count bigram frequencies
bigram_counts <- bigrams %>%
  count(bigram, sort = TRUE)

head(bigram_counts, 10)

# Split each bigram into its two words
bigrams_separated <- bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Filter out stop words
bigrams_filtered <- bigrams_separated %>%
  filter(
    !word1 %in% chinese_stopwords,
    !word2 %in% chinese_stopwords
  )

# Recombine and count
bigram_counts_clean <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ") %>%
  count(bigram, sort = TRUE)

head(bigram_counts_clean, 10)
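
Note that unnest_tokens() relies on ICU word boundaries to split Chinese, which may differ from the jiebaR segmentation. A minimal alternative sketch builds bigrams directly from the segmented, stop-word-filtered tokens in text_data$filtered:

# Sketch: bigrams from the jiebaR segmentation
bigrams_from_segments <- lapply(text_data$filtered, function(w) {
  if (length(w) < 2) return(character(0))
  paste(head(w, -1), tail(w, -1))  # pair consecutive words
})
head(sort(table(unlist(bigrams_from_segments)), decreasing = TRUE), 10)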

# Build a bigram network graph
install.packages("igraph")
install.packages("ggraph")
library(igraph)
library(ggraph)

# Prepare the network data
bigram_graph <- bigram_counts_clean %>%
  filter(n > 1) %>%  # keep only bigrams that appear more than once
  separate(bigram, c("from", "to"), sep = " ") %>%
  graph_from_data_frame()

# Draw the network graph
set.seed(123)
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void() +
  labs(title = "Word Co-occurrence Network")

# Trigram analysis
trigrams <- text_data %>%
  select(id, clean_text) %>%
  unnest_tokens(trigram, clean_text, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(
    !word1 %in% chinese_stopwords,
    !word2 %in% chinese_stopwords,
    !word3 %in% chinese_stopwords
  ) %>%
  unite(trigram, word1, word2, word3, sep = " ") %>%
  count(trigram, sort = TRUE)

head(trigrams, 10)

2.3 TF-IDF Analysis

# TF-IDF (term frequency - inverse document frequency) analysis
# TF (term frequency): how often a word occurs within a single document
# IDF (inverse document frequency): how rare the word is across the whole corpus,
#   idf = ln(number of documents / number of documents containing the word)
# TF-IDF = TF * IDF, so words frequent in one document but rare overall score highest
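
To make the weighting concrete, here is a tiny hand-computed sketch with made-up numbers (tidytext's bind_tf_idf() uses the natural logarithm for IDF):

# Toy example: a word occurs 3 times in a 100-word document
# and appears in 2 of the 10 documents in the corpus
tf  <- 3 / 100        # term frequency
idf <- log(10 / 2)    # inverse document frequency
tf * idf              # tf-idf weight, about 0.048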

# Compute TF-IDF with tidytext
text_data_tfidf <- text_data %>%
  select(id, category, filtered) %>%
  unnest(filtered) %>%
  rename(word = filtered) %>%
  filter(!word %in% chinese_stopwords) %>%
  count(category, word, sort = TRUE) %>%
  ungroup()

# Compute TF-IDF
total_words <- text_data_tfidf %>%
  group_by(category) %>%
  summarize(total = sum(n))

text_data_tfidf <- left_join(text_data_tfidf, total_words)

text_data_tfidf <- text_data_tfidf %>%
  bind_tf_idf(word, category, n)

# Words with the highest TF-IDF
head(text_data_tfidf %>% arrange(desc(tf_idf)))

# Most important words per category (ranked by TF-IDF)
top_words_by_category <- text_data_tfidf %>%
  group_by(category) %>%
  top_n(3, tf_idf) %>%
  ungroup() %>%
  arrange(category, desc(tf_idf))

print(top_words_by_category)

# Visualize the key words for each category
top_words_by_category %>%
  mutate(word = reorder_within(word, tf_idf, category)) %>%
  ggplot(aes(x = word, y = tf_idf, fill = category)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ category, scales = "free") +
  coord_flip() +
  scale_x_reordered() +
  labs(
    title = "各主題最重要的詞語(TF-IDF)",
    x = "詞語",
    y = "TF-IDF值"
  ) +
  theme_minimal()

# Document-level TF-IDF
document_tfidf <- text_data %>%
  select(id, filtered) %>%
  unnest(filtered) %>%
  rename(word = filtered) %>%
  filter(!word %in% chinese_stopwords) %>%
  count(id, word, sort = TRUE) %>%
  bind_tf_idf(word, id, n)

# Most important words in each document
top_words_per_document <- document_tfidf %>%
  group_by(id) %>%
  top_n(3, tf_idf) %>%
  ungroup() %>%
  arrange(id, desc(tf_idf))

print(top_words_per_document)

# Create a document-term matrix
dtm <- document_tfidf %>%
  cast_dtm(id, word, n)

# Inspect the document-term matrix
inspect(dtm[1:3, 1:10])

😊 Part 3: Sentiment Analysis

3.1 Dictionary-Based Sentiment Analysis

# Load sentiment lexicons
# The syuzhet package provides sentiment analysis helpers
library(syuzhet)

# Chinese sentiment analysis (requires a Chinese sentiment lexicon)
# Create a simple Chinese sentiment lexicon
positive_chinese <- c(
  "喜歡", "愛", "好", "優秀", "棒", "贊", "強大", "有趣", "出色",
  "重要", "直觀", "易懂", "改變"
)

negative_chinese <- c(
  "討厭", "恨", "差", "糟糕", "爛", "困難", "複雜", "難懂", "失敗",
  "問題", "錯誤", "不足", "缺陷", "限制", "挑戰"
)

# Simple dictionary-based sentiment analysis function
simple_sentiment_analysis <- function(text, pos_dict, neg_dict) {
  words <- segment(text, seg_engine)
  
  pos_count <- sum(words %in% pos_dict)
  neg_count <- sum(words %in% neg_dict)
  
  sentiment_score <- pos_count - neg_count
  
  return(list(
    positive = pos_count,
    negative = neg_count,
    score = sentiment_score,
    sentiment = ifelse(sentiment_score > 0, "積極", 
                       ifelse(sentiment_score < 0, "消極", "中性"))
  ))
}

# Test the sentiment analysis
test_text <- "我喜歡學習R語言,它功能強大而且很有趣。"
sentiment_result <- simple_sentiment_analysis(test_text, positive_chinese, negative_chinese)
print(sentiment_result)

# Run sentiment analysis on the whole data set
text_data$sentiment <- sapply(text_data$clean_text, function(text) {
  result <- simple_sentiment_analysis(text, positive_chinese, negative_chinese)
  return(result$sentiment)
})

# Inspect the distribution of sentiment labels
table(text_data$sentiment)

# English sentiment lexicons (NRC)
library(textdata)

# Download the NRC sentiment lexicon (a download prompt appears on first use)
# nrc_dict <- get_sentiments("nrc")

# Or use the built-in AFINN and Bing lexicons
afinn_dict <- get_sentiments("afinn")
bing_dict <- get_sentiments("bing")

# Inspect the lexicons
head(afinn_dict)
head(bing_dict)

# English sentiment analysis example
english_text <- "I love learning R programming. It is powerful and interesting."
english_words <- unlist(strsplit(tolower(english_text), "\\W+"))

# AFINN lexicon (numeric scores)
afinn_scores <- afinn_dict$value[match(english_words, afinn_dict$word)]
total_score <- sum(afinn_scores, na.rm = TRUE)

# Bing lexicon (positive/negative labels)
bing_sentiments <- bing_dict$sentiment[match(english_words, bing_dict$word)]
sentiment_counts <- table(bing_sentiments[!is.na(bing_sentiments)])

cat("AFINN total score:", total_score, "\n")
cat("Bing sentiment counts:\n")
print(sentiment_counts)
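
The syuzhet package loaded earlier wraps these lexicons in one-call helpers; a brief sketch on the same English sentence:

# Sketch: sentiment scoring with syuzhet
get_sentiment(english_text, method = "afinn")  # numeric AFINN score
get_sentiment(english_text, method = "bing")   # positive minus negative word count
get_nrc_sentiment(english_text)                # counts for eight NRC emotions plus polarity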

3.2 Sentiment Trend Analysis

# Create time-series text data (simulated)
set.seed(123)
time_series_text <- data.frame(
  date = seq.Date(as.Date("2023-01-01"), as.Date("2023-01-31"), by = "day"),
  text = c(
    "今天很開心,項目進展順利。",
    "遇到了一些技術難題,需要研究。",
    "團隊合作很愉快,大家都很努力。",
    "客户反饋不錯,繼續加油。",
    "系統出現了一些問題,需要修復。",
    "學習新知識很有收穫。",
    "工作壓力有點大,需要調整。",
    "完成了一個重要里程碑,值得慶祝。",
    "遇到了挑戰,但能夠克服。",
    "收到了積極的評價,很開心。",
    "技術問題還沒有解決,有些困擾。",
    "團隊氛圍很好,工作效率高。",
    "項目延期了,需要加快進度。",
    "學到了很多新技能,很有成就感。",
    "工作與生活需要更好的平衡。",
    "取得了不錯的成果,繼續努力。",
    "遇到了一些挫折,但不會放棄。",
    "收到了建設性的反饋,有助於改進。",
    "工作節奏太快,需要休息。",
    "看到自己的進步,很欣慰。",
    "技術更新很快,需要不斷學習。",
    "團隊溝通很順暢,合作愉快。",
    "工作量有些大,需要合理安排。",
    "解決了長期困擾的問題,很有成就感。",
    "收到了客户的感謝,很感動。",
    "遇到了一些困難,但相信能解決。",
    "工作環境很舒適,有利於專注。",
    "時間管理需要改進。",
    "完成了既定目標,值得慶祝。",
    "面臨新的挑戰,準備迎接。",
    "整體來説,這個月收穫很多。"
  ),
  stringsAsFactors = FALSE
)

# Sentiment analysis
time_series_text$sentiment_score <- sapply(time_series_text$text, function(text) {
  result <- simple_sentiment_analysis(text, positive_chinese, negative_chinese)
  return(result$score)
})

# Classify sentiment
time_series_text$sentiment_category <- ifelse(
  time_series_text$sentiment_score > 0, "積極",
  ifelse(time_series_text$sentiment_score < 0, "消極", "中性")
)

# Inspect the results
head(time_series_text)

# Visualize the sentiment trend
library(ggplot2)

# Daily sentiment score trend
ggplot(time_series_text, aes(x = date, y = sentiment_score)) +
  geom_line(color = "steelblue", size = 1) +
  geom_point(color = "darkblue", size = 2) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray") +
  labs(
    title = "情感得分趨勢",
    x = "日期",
    y = "情感得分"
  ) +
  theme_minimal()

# Distribution of sentiment categories
sentiment_summary <- time_series_text %>%
  group_by(sentiment_category) %>%
  summarize(count = n(), percentage = n() / nrow(time_series_text) * 100)

ggplot(sentiment_summary, aes(x = "", y = count, fill = sentiment_category)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y", start = 0) +
  geom_text(aes(label = paste0(round(percentage, 1), "%")), 
            position = position_stack(vjust = 0.5)) +
  labs(
    title = "情感類別分佈",
    fill = "情感類別"
  ) +
  theme_void()

# Smooth the sentiment trend with a moving average
library(zoo)

time_series_text$sentiment_ma <- rollmean(
  time_series_text$sentiment_score, 
  k = 7, 
  fill = NA,
  align = "right"
)

# Plot raw scores against the 7-day moving average
ggplot(time_series_text, aes(x = date)) +
  geom_line(aes(y = sentiment_score), color = "lightblue", alpha = 0.5, size = 0.8) +
  geom_line(aes(y = sentiment_ma), color = "darkblue", size = 1.2) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray") +
  labs(
    title = "情感得分趨勢(原始與7日移動平均)",
    x = "日期",
    y = "情感得分"
  ) +
  theme_minimal()

3.3 Advanced Sentiment Analysis

# Sentiment analysis with machine learning
# Prepare training data
# Simulated data is used here; real applications need properly labelled sentiment data

set.seed(123)

train_data <- data.frame(
  text = c(
    # Positive samples
    "這個產品非常好用,我很喜歡。",
    "服務質量優秀,非常滿意。",
    "體驗很棒,下次還會再來。",
    "效果超出預期,值得推薦。",
    "操作簡單,功能強大。",
    "性價比很高,物超所值。",
    "設計精美,質量上乘。",
    "客服態度很好,解決問題迅速。",
    "使用方便,體驗流暢。",
    "非常實用,幫助很大。",
    
    # Negative samples
    "產品質量很差,很失望。",
    "服務態度惡劣,不滿意。",
    "效果不好,不推薦購買。",
    "操作複雜,很難使用。",
    "價格太貴,不值這個價。",
    "設計不合理,使用不便。",
    "客服迴應慢,問題沒解決。",
    "功能有限,不夠實用。",
    "存在缺陷,需要改進。",
    "體驗糟糕,不會再來。"
  ),
  sentiment = c(rep("積極", 10), rep("消極", 10)),
  stringsAsFactors = FALSE
)

# Duplicate the data to increase the sample size
train_data <- train_data[rep(1:nrow(train_data), 5), ]
rownames(train_data) <- NULL

# Add a few more samples
additional_texts <- c(
  "非常喜歡,強烈推薦!",
  "太棒了,超出預期!",
  "完美,無可挑剔!",
  "糟糕透頂,再也不買了!",
  "太差了,浪費時間!",
  "一般般,沒什麼特別的。",
  "還行,可以接受。",
  "馬馬虎虎,勉強能用。"
)

additional_sentiments <- c("積極", "積極", "積極", "消極", "消極", "中性", "中性", "中性")

train_data <- rbind(
  train_data,
  data.frame(
    text = additional_texts,
    sentiment = additional_sentiments,
    stringsAsFactors = FALSE
  )
)

# Text preprocessing
preprocess_text <- function(text) {
  text <- tolower(text)
  text <- gsub("[[:punct:]]", " ", text)
  text <- gsub("\\s+", " ", text)
  text <- trimws(text)
  return(text)
}

train_data$clean_text <- sapply(train_data$text, preprocess_text)

# Chinese text has no spaces between words, so segment with jiebaR and rejoin with
# spaces so that tm's whitespace tokenizer can split the documents into terms
train_data$clean_text <- sapply(train_data$clean_text, function(x) {
  paste(segment(x, seg_engine), collapse = " ")
})

# Build a document-term matrix with tm
library(tm)

# Create the corpus
corpus <- VCorpus(VectorSource(train_data$clean_text))

# Create the document-term matrix
dtm <- DocumentTermMatrix(corpus, control = list(
  weighting = weightTf,
  stopwords = TRUE,
  wordLengths = c(2, Inf),  # newer tm versions use wordLengths instead of minWordLength
  removeNumbers = TRUE,
  removePunctuation = TRUE
))

# Convert to a data frame
dtm_df <- as.data.frame(as.matrix(dtm))

# Add the sentiment labels
dtm_df$sentiment <- as.factor(train_data$sentiment)

# Split into training and test sets
set.seed(123)
train_index <- sample(1:nrow(dtm_df), 0.7 * nrow(dtm_df))
train_set <- dtm_df[train_index, ]
test_set <- dtm_df[-train_index, ]

# Train a naive Bayes classifier
library(e1071)

nb_model <- naiveBayes(sentiment ~ ., data = train_set)

# Predict on the test set
predictions <- predict(nb_model, test_set)

# Evaluate the model
confusion_matrix <- table(Predicted = predictions, Actual = test_set$sentiment)
print(confusion_matrix)

accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
cat("Accuracy:", round(accuracy * 100, 2), "%\n")

# Classify new texts
new_texts <- c(
  "這個產品真的很不錯,我很滿意。",
  "質量太差了,非常失望。",
  "一般般,沒什麼感覺。"
)

predict_sentiment <- function(texts, model, train_dtm) {
  # Preprocess and segment the new texts the same way as the training data
  clean_texts <- sapply(texts, preprocess_text)
  clean_texts <- sapply(clean_texts, function(x) paste(segment(x, seg_engine), collapse = " "))
  
  # Create a new corpus
  new_corpus <- VCorpus(VectorSource(clean_texts))
  
  # Build a DTM restricted to the training vocabulary
  # (Terms() expects a document-term matrix, not a corpus)
  new_dtm <- DocumentTermMatrix(new_corpus, control = list(
    dictionary = Terms(train_dtm),
    weighting = weightTf,
    stopwords = TRUE,
    wordLengths = c(2, Inf),
    removeNumbers = TRUE,
    removePunctuation = TRUE
  ))
  
  new_dtm_df <- as.data.frame(as.matrix(new_dtm))
  
  # Predict
  predictions <- predict(model, new_dtm_df)
  
  return(data.frame(
    text = texts,
    predicted_sentiment = predictions,
    stringsAsFactors = FALSE
  ))
}

# Test the prediction (pass the training document-term matrix, not the corpus)
results <- predict_sentiment(new_texts, nb_model, dtm)
print(results)

🧩 Part 4: Topic Modeling

4.1 LDA Topic Model Basics

# LDA (Latent Dirichlet Allocation) topic modeling
# Prepare the data
library(topicmodels)
library(tidytext)

# Create a larger text data set
set.seed(123)
n_docs <- 50

# Generate documents for three simulated topics
topic1_docs <- replicate(20, paste(
  sample(c("數據", "分析", "統計", "模型", "算法", "預測", "學習", "訓練"), 8, replace = TRUE),
  collapse = " "
))

topic2_docs <- replicate(15, paste(
  sample(c("文本", "挖掘", "語言", "處理", "情感", "分析", "分類", "聚類"), 8, replace = TRUE),
  collapse = " "
))

topic3_docs <- replicate(15, paste(
  sample(c("可視", "圖表", "圖形", "展示", "交互", "設計", "顏色", "佈局"), 8, replace = TRUE),
  collapse = " "
))

all_docs <- c(topic1_docs, topic2_docs, topic3_docs)

# Create the document data frame
lda_data <- data.frame(
  doc_id = 1:n_docs,
  text = all_docs,
  true_topic = c(rep("數據分析", 20), rep("文本挖掘", 15), rep("數據可視化", 15)),
  stringsAsFactors = FALSE
)

# Preprocessing: split into words and remove stop words
lda_data$words <- lapply(lda_data$text, function(x) {
  words <- unlist(strsplit(x, " "))
  words[!words %in% chinese_stopwords]
})

# Build the document-term matrix
# First build a per-document word frequency data frame
word_freq_list <- list()

for (i in 1:nrow(lda_data)) {
  words <- lda_data$words[[i]]
  word_counts <- table(words)
  
  for (word in names(word_counts)) {
    word_freq_list[[length(word_freq_list) + 1]] <- data.frame(
      doc_id = i,
      word = word,
      freq = as.numeric(word_counts[word])
    )
  }
}

word_freq_df <- do.call(rbind, word_freq_list)

# Cast into a document-term matrix
dtm_lda <- word_freq_df %>%
  cast_dtm(doc_id, word, freq)

# Inspect the DTM
inspect(dtm_lda[1:5, 1:10])
dim(dtm_lda)

# Fit the LDA model
set.seed(123)
lda_model <- LDA(dtm_lda, k = 3, control = list(seed = 123))

# Inspect the model
lda_model
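
topicmodels also provides quick accessors for fitted models; for a fast look without tidy():

# Top 5 terms per topic and the most likely topic for each document
terms(lda_model, 5)
head(topics(lda_model))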

# Extract the topic-word distribution (beta)
topic_word_dist <- tidy(lda_model, matrix = "beta")

# Top 10 terms for each topic
top_terms <- topic_word_dist %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

print(top_terms)

# Visualize the topic-word distributions
library(ggplot2)

top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered() +
  labs(
    title = "LDA主題模型 - 主題詞語分佈",
    x = "詞語",
    y = "概率(beta)"
  ) +
  theme_minimal()

4.2 Document-Topic Distribution

# Extract the document-topic distribution (gamma)
doc_topic_dist <- tidy(lda_model, matrix = "gamma")

# Inspect the document-topic distribution
head(doc_topic_dist)

# Assign each document its dominant topic
doc_classifications <- doc_topic_dist %>%
  group_by(document) %>%
  top_n(1, gamma) %>%
  ungroup()

# Label the predicted topics
doc_classifications <- doc_classifications %>%
  mutate(
    document = as.integer(document),
    predicted_topic = paste("Topic", topic)
  )

# Compare with the true topics
comparison <- lda_data %>%
  select(doc_id, true_topic) %>%
  left_join(doc_classifications, by = c("doc_id" = "document"))

# Inspect the comparison
head(comparison)

# Compute accuracy: LDA topic numbers are arbitrary, so first map each
# LDA topic to the majority true label among its documents
topic_map <- comparison %>%
  count(topic, true_topic) %>%
  group_by(topic) %>%
  slice_max(n, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  select(topic, mapped_topic = true_topic)

comparison <- comparison %>%
  left_join(topic_map, by = "topic")

accuracy <- mean(comparison$true_topic == comparison$mapped_topic, na.rm = TRUE)
cat("Topic classification accuracy:", round(accuracy * 100, 2), "%\n")

# Visualize document-topic distributions
# Pick a sample of documents to visualize
sample_docs <- sample(1:n_docs, 10)

doc_topic_dist %>%
  filter(as.integer(document) %in% sample_docs) %>%
  mutate(document = factor(document, levels = as.character(sample_docs))) %>%
  ggplot(aes(factor(topic), gamma, fill = factor(topic))) +
  geom_col() +
  facet_wrap(~ document, ncol = 5) +
  labs(
    title = "文檔-主題分佈(示例文檔)",
    x = "主題",
    y = "概率(gamma)"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

4.3 Topic Model Tuning and Evaluation

# Finding the best number of topics
# Use perplexity to evaluate the models

# Fit models with different numbers of topics
topic_numbers <- c(2, 3, 4, 5, 6, 7, 8)
perplexity_values <- numeric(length(topic_numbers))

set.seed(123)
for (i in seq_along(topic_numbers)) {
  k <- topic_numbers[i]
  cat("訓練主題數 k =", k, "的模型...\n")
  
  lda_temp <- LDA(dtm_lda, k = k, control = list(seed = 123))
  perplexity_values[i] <- perplexity(lda_temp, dtm_lda)
}

# Collect the results
perplexity_df <- data.frame(
  k = topic_numbers,
  perplexity = perplexity_values
)

# Plot perplexity against the number of topics
ggplot(perplexity_df, aes(x = k, y = perplexity)) +
  geom_line(color = "steelblue", size = 1) +
  geom_point(color = "darkblue", size = 3) +
  labs(
    title = "LDA模型困惑度 vs 主題數",
    x = "主題數(k)",
    y = "困惑度"
  ) +
  theme_minimal()

# Refit the model with the best number of topics
# (perplexity tends to keep decreasing as k grows, so also weigh interpretability)
best_k <- topic_numbers[which.min(perplexity_values)]
cat("Best number of topics:", best_k, "\n")

set.seed(123)
best_lda_model <- LDA(dtm_lda, k = best_k, control = list(seed = 123))

# Extract the topics
best_topic_terms <- tidy(best_lda_model, matrix = "beta") %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

# Visualize the topics of the best model
best_topic_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered() +
  labs(
    title =