📚 Today's Goals

  1. Understand basic machine learning concepts and categories
  2. Master data preprocessing and feature engineering
  3. Learn fundamental supervised learning algorithms
  4. Understand unsupervised learning methods
  5. Master model evaluation and validation techniques

🤖 Part 1: Machine Learning Fundamentals

1.1 Overview of Machine Learning

# Machine learning is about letting computers learn patterns from data
# Main types:
# 1. Supervised learning: labeled data
# 2. Unsupervised learning: unlabeled data
# 3. Semi-supervised learning: partially labeled data
# 4. Reinforcement learning: learning through rewards
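
# To make the first two types concrete with the iris data used below: supervised
# learning sees the features together with the Species labels, while unsupervised
# learning works from the features alone (a minimal illustration):
head(iris[, c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species")])  # labeled data
head(iris[, 1:4])                                                                          # unlabeled data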

# Install the required machine learning packages
install.packages(c("caret", "randomForest", "e1071", "glmnet", "rpart", 
                   "xgboost", "cluster", "factoextra", "kknn", "kernlab"))

# Load the packages
library(caret)          # unified interface for classification and regression training
library(randomForest)   # random forests
library(e1071)          # SVM and other algorithms
library(glmnet)         # regularized regression
library(rpart)          # decision trees
library(xgboost)        # gradient boosted trees
library(cluster)        # cluster analysis
library(factoextra)     # clustering visualization

1.2 The Machine Learning Workflow

# A typical machine learning workflow:
# 1. Data collection and exploration
# 2. Data preprocessing
# 3. Feature engineering
# 4. Model selection and training
# 5. Model evaluation
# 6. Model tuning
# 7. Model deployment

# We will use several classic datasets throughout the lesson
# Classification: the iris dataset
data(iris)

# Regression: the Boston housing dataset (the MASS package ships with R, so no install is needed)
library(MASS)
data(Boston)

# Clustering: the iris dataset (ignoring the labels)

🧹 Part 2: Data Preprocessing and Feature Engineering

2.1 Data Splitting

# Split the data into training and test sets (a separate validation set, when
# needed, is handled later via cross-validation)
# Set a random seed for reproducibility
set.seed(123)

# Method 1: caret's createDataPartition
library(caret)

# Stratified sampling for classification (preserves the class proportions)
train_index <- createDataPartition(iris$Species, 
                                   p = 0.7,      # 70% for training
                                   list = FALSE, # return a vector rather than a list
                                   times = 1)    # create a single split

train_data <- iris[train_index, ]
test_data <- iris[-train_index, ]

# Check the class distributions
cat("Training set class distribution:\n")
print(table(train_data$Species))
cat("\nTest set class distribution:\n")
print(table(test_data$Species))

# Method 2: simple random sampling for a regression problem
train_index_boston <- sample(1:nrow(Boston), floor(nrow(Boston) * 0.7))
train_boston <- Boston[train_index_boston, ]
test_boston <- Boston[-train_index_boston, ]

# Method 3: cross-validation folds with caret's createFolds
cv_folds <- createFolds(iris$Species, k = 5, returnTrain = TRUE)
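
# A minimal sketch of how these folds can drive a manual cross-validation loop
# (with returnTrain = TRUE each list element holds the training-row indices;
# rpart was loaded above):
cv_accuracy <- sapply(cv_folds, function(train_rows) {
  fold_train <- iris[train_rows, ]
  fold_test  <- iris[-train_rows, ]
  fold_model <- rpart(Species ~ ., data = fold_train, method = "class")
  fold_pred  <- predict(fold_model, fold_test, type = "class")
  mean(fold_pred == fold_test$Species)   # accuracy on the held-out fold
})
mean(cv_accuracy)                        # average cross-validated accuracy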

2.2 Data Preprocessing

# Handling missing values
# Create an example dataset containing missing values
data_with_na <- iris
data_with_na[sample(1:nrow(iris), 10), "Sepal.Length"] <- NA
data_with_na[sample(1:nrow(iris), 5), "Petal.Width"] <- NA

# Count missing values per column
colSums(is.na(data_with_na))

# Method 1: drop rows with missing values
data_complete <- na.omit(data_with_na)

# Method 2: impute with the mean/median
data_imputed <- data_with_na
for(col in names(data_imputed)) {
  if(is.numeric(data_imputed[[col]])) {
    data_imputed[[col]][is.na(data_imputed[[col]])] <- 
      median(data_imputed[[col]], na.rm = TRUE)
  }
}

# Method 3: multiple imputation with the mice package (more advanced)
install.packages("mice")
library(mice)
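
# A minimal sketch of multiple imputation with mice; the settings below are
# illustrative rather than tuned:
imp <- mice(data_with_na, m = 5, method = "pmm", seed = 123, printFlag = FALSE)
data_mice <- complete(imp, 1)   # extract the first completed dataset
colSums(is.na(data_mice))       # no missing values remain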

# Standardization / normalization
# Method 1: manual standardization (z-scores)
standardize <- function(x) {
  (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}

iris_scaled <- iris
iris_scaled[, 1:4] <- apply(iris_scaled[, 1:4], 2, standardize)

# Method 2: caret's preProcess function
preprocess_params <- preProcess(iris[, 1:4], method = c("center", "scale"))
iris_processed <- predict(preprocess_params, iris[, 1:4])

# Check the standardized results
summary(iris_scaled[, 1:4])
summary(iris_processed)

# Handling categorical variables
# Create dummy variables (one-hot encoding)
dummy_vars <- model.matrix(~ Species - 1, data = iris)
head(dummy_vars)

# Using caret's dummyVars function
dummy_model <- dummyVars(~ Species, data = iris, fullRank = TRUE)
iris_dummy <- predict(dummy_model, iris)
head(iris_dummy)

2.3 Feature Engineering

# Feature engineering: creating new features and selecting the important ones
# Work on a copy so the built-in iris data stays unchanged for later sections
iris_fe <- iris

# 1. Interaction features
iris_fe$Sepal.Area <- iris_fe$Sepal.Length * iris_fe$Sepal.Width
iris_fe$Petal.Area <- iris_fe$Petal.Length * iris_fe$Petal.Width
iris_fe$Length.Ratio <- iris_fe$Petal.Length / iris_fe$Sepal.Length

# 2. Polynomial features
iris_fe$Sepal.Length.sq <- iris_fe$Sepal.Length^2
iris_fe$Sepal.Length.cu <- iris_fe$Sepal.Length^3

# 3. Binning (discretizing a continuous variable)
iris_fe$Sepal.Length.bin <- cut(iris_fe$Sepal.Length, 
                                breaks = quantile(iris_fe$Sepal.Length, 
                                                  probs = seq(0, 1, 0.25)),
                                labels = c("Q1", "Q2", "Q3", "Q4"),
                                include.lowest = TRUE)  # otherwise the minimum value becomes NA

# 4. Feature selection
# Method 1: correlation-based feature selection
cor_matrix <- cor(iris[, 1:4])
print(cor_matrix)

# Method 2: feature importance from a random forest (fit on the engineered copy)
rf_model <- randomForest(Species ~ ., data = iris_fe, importance = TRUE)
importance_scores <- importance(rf_model)
print(importance_scores)

# Visualize the feature importance
varImpPlot(rf_model)

# Method 3: recursive feature elimination (RFE)
library(caret)

# Define the control parameters
rfe_control <- rfeControl(functions = rfFuncs,     # use random forests
                          method = "cv",           # cross-validation
                          number = 5,              # 5-fold CV
                          verbose = FALSE)

# Run RFE
rfe_result <- rfe(iris[, 1:4], iris$Species,
                  sizes = c(1:4),          # candidate numbers of features
                  rfeControl = rfe_control)

print(rfe_result)
plot(rfe_result)

📈 Part 3: Supervised Learning Algorithms

3.1 Logistic Regression (Classification)

# Logistic regression handles binary (and, with extensions, multi-class) classification
# Prepare a binary classification target
iris_binary <- iris
iris_binary$Species_binary <- ifelse(iris$Species == "setosa", 1, 0)

# Split the data
set.seed(123)
train_index <- createDataPartition(iris_binary$Species_binary, p = 0.7, list = FALSE)
train_data <- iris_binary[train_index, ]
test_data <- iris_binary[-train_index, ]

# Fit the logistic regression model
# (setosa is linearly separable from the other two species, so glm may warn that
#  fitted probabilities of 0 or 1 occurred; that is expected here)
logistic_model <- glm(Species_binary ~ Sepal.Length + Sepal.Width + 
                      Petal.Length + Petal.Width,
                    data = train_data,
                    family = binomial(link = "logit"))

summary(logistic_model)

# Predict
test_predictions <- predict(logistic_model, newdata = test_data, type = "response")
test_predictions_class <- ifelse(test_predictions > 0.5, 1, 0)

# Evaluate
confusion_matrix <- table(Predicted = test_predictions_class, 
                         Actual = test_data$Species_binary)
print(confusion_matrix)

# Compute performance metrics
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ])
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2])
f1_score <- 2 * (precision * recall) / (precision + recall)

cat(sprintf("準確率: %.2f%%\n", accuracy * 100))
cat(sprintf("精確率: %.2f%%\n", precision * 100))
cat(sprintf("召回率: %.2f%%\n", recall * 100))
cat(sprintf("F1分數: %.2f\n", f1_score))

# ROC曲線和AUC
install.packages("pROC")
library(pROC)

roc_curve <- roc(test_data$Species_binary, test_predictions)
plot(roc_curve, main = "ROC曲線")
auc_value <- auc(roc_curve)
cat(sprintf("AUC值: %.3f\n", auc_value))

3.2 Decision Trees

# Decision tree classification
# Using the rpart package
library(rpart)
install.packages("rpart.plot")   # rpart.plot is not in the install list above
library(rpart.plot)

# Re-split the original iris data (the previous split was built around Species_binary,
# which would leak the answer into a multi-class model)
set.seed(123)
train_index <- createDataPartition(iris$Species, p = 0.7, list = FALSE)
train_data <- iris[train_index, ]
test_data <- iris[-train_index, ]

# Fit the decision tree model
tree_model <- rpart(Species ~ ., 
                    data = train_data,
                    method = "class",  # classification task
                    control = rpart.control(
                      minsplit = 10,   # minimum observations needed to attempt a split
                      minbucket = 5,   # minimum observations in a leaf node
                      maxdepth = 5,    # maximum tree depth
                      cp = 0.01        # complexity parameter
                    ))

# Visualize the tree
rpart.plot(tree_model, 
           type = 2, 
           extra = 104, 
           fallen.leaves = TRUE,
           main = "Iris Classification Decision Tree")

# Variable importance
print(tree_model$variable.importance)

# Predict
tree_predictions <- predict(tree_model, test_data, type = "class")

# Evaluate
tree_cm <- table(Predicted = tree_predictions, Actual = test_data$Species)
print(tree_cm)
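
# The cp value above also controls pruning; a minimal sketch of post-pruning
# with the cross-validated error that rpart records while growing the tree:
printcp(tree_model)                                  # cp table with cross-validated error (xerror)
best_cp <- tree_model$cptable[which.min(tree_model$cptable[, "xerror"]), "CP"]
pruned_tree <- prune(tree_model, cp = best_cp)       # prune back to the best cp
rpart.plot(pruned_tree, main = "Pruned Decision Tree")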

# Decision tree regression (Boston housing data)
set.seed(123)
boston_index <- sample(1:nrow(Boston), floor(nrow(Boston) * 0.7))
train_boston <- Boston[boston_index, ]
test_boston <- Boston[-boston_index, ]

tree_reg_model <- rpart(medv ~ ., 
                       data = train_boston,
                       method = "anova",  # regression task
                       control = rpart.control(
                         minsplit = 20,
                         minbucket = 10,
                         maxdepth = 5,
                         cp = 0.01
                       ))

# Predict and evaluate the regression model
tree_reg_pred <- predict(tree_reg_model, test_boston)

# Compute regression metrics
mae <- mean(abs(tree_reg_pred - test_boston$medv))
rmse <- sqrt(mean((tree_reg_pred - test_boston$medv)^2))
r_squared <- 1 - sum((test_boston$medv - tree_reg_pred)^2) / 
  sum((test_boston$medv - mean(test_boston$medv))^2)

cat(sprintf("MAE: %.3f\n", mae))
cat(sprintf("RMSE: %.3f\n", rmse))
cat(sprintf("R²: %.3f\n", r_squared))

3.3 Random Forests

# Random forest: an ensemble of many decision trees
# Classification
rf_class_model <- randomForest(Species ~ .,
                              data = train_data,
                              ntree = 100,      # number of trees
                              mtry = 2,         # number of features tried at each split
                              importance = TRUE,
                              proximity = TRUE)

print(rf_class_model)

# Variable importance
importance(rf_class_model)
varImpPlot(rf_class_model, main = "Random Forest Feature Importance")

# Predict
rf_predictions <- predict(rf_class_model, test_data)

# Evaluate
rf_cm <- table(Predicted = rf_predictions, Actual = test_data$Species)
print(rf_cm)
cat("Accuracy:", sum(diag(rf_cm)) / sum(rf_cm), "\n")

# Regression
rf_reg_model <- randomForest(medv ~ .,
                            data = train_boston,
                            ntree = 200,
                            mtry = 4,
                            importance = TRUE)

# Predict and evaluate
rf_reg_pred <- predict(rf_reg_model, test_boston)

rf_mae <- mean(abs(rf_reg_pred - test_boston$medv))
rf_rmse <- sqrt(mean((rf_reg_pred - test_boston$medv)^2))
rf_r2 <- 1 - sum((test_boston$medv - rf_reg_pred)^2) / 
  sum((test_boston$medv - mean(test_boston$medv))^2)

cat(sprintf("Random forest regression - MAE: %.3f, RMSE: %.3f, R²: %.3f\n", 
            rf_mae, rf_rmse, rf_r2))

3.4 Support Vector Machines (SVM)

# SVM classification
library(e1071)

# Linear SVM
svm_linear_model <- svm(Species ~ .,
                        data = train_data,
                        kernel = "linear",
                        cost = 1,        # penalty (cost) parameter
                        scale = TRUE)    # standardize the data

# Predict
svm_linear_pred <- predict(svm_linear_model, test_data)

# Evaluate
svm_linear_cm <- table(Predicted = svm_linear_pred, Actual = test_data$Species)
print(svm_linear_cm)

# RBF-kernel SVM (non-linear)
svm_rbf_model <- svm(Species ~ .,
                     data = train_data,
                     kernel = "radial",
                     cost = 1,
                     gamma = 0.1,       # kernel width parameter
                     scale = TRUE)

svm_rbf_pred <- predict(svm_rbf_model, test_data)
svm_rbf_cm <- table(Predicted = svm_rbf_pred, Actual = test_data$Species)
print(svm_rbf_cm)

# Tune the hyperparameters with cross-validation
tune_result <- tune(svm, Species ~ ., data = train_data,
                    ranges = list(
                      cost = c(0.1, 1, 10, 100),
                      gamma = c(0.01, 0.1, 1, 10)
                    ),
                    kernel = "radial")

print(tune_result)
plot(tune_result)

# Use the best parameters
best_svm_model <- tune_result$best.model
best_svm_pred <- predict(best_svm_model, test_data)
best_svm_cm <- table(Predicted = best_svm_pred, Actual = test_data$Species)
print(best_svm_cm)

3.5 Gradient Boosted Trees (XGBoost)

# XGBoost: an efficient gradient boosting implementation
library(xgboost)

# Prepare the data (convert to matrix format)
x_train <- as.matrix(train_data[, 1:4])
y_train <- as.numeric(train_data$Species) - 1  # recode the classes as 0, 1, 2

x_test <- as.matrix(test_data[, 1:4])
y_test <- as.numeric(test_data$Species) - 1

# Set the parameters
params <- list(
  objective = "multi:softprob",  # multi-class classification
  num_class = 3,                 # number of classes
  eta = 0.3,                     # learning rate
  max_depth = 6,                 # maximum tree depth
  min_child_weight = 1,
  subsample = 0.8,               # row subsampling ratio
  colsample_bytree = 0.8,        # column (feature) subsampling ratio
  eval_metric = "mlogloss"       # evaluation metric
)

# Train the model
xgb_model <- xgboost(
  data = x_train,
  label = y_train,
  params = params,
  nrounds = 100,                 # number of boosting rounds
  verbose = 0                    # suppress the training log
)
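
# nrounds = 100 above is a guess; a minimal sketch of choosing it with xgboost's
# built-in cross-validation and early stopping (the settings here are illustrative):
dtrain <- xgb.DMatrix(x_train, label = y_train)
xgb_cv <- xgb.cv(params = params,
                 data = dtrain,
                 nrounds = 200,
                 nfold = 5,
                 early_stopping_rounds = 10,  # stop once mlogloss stops improving
                 verbose = 0)
xgb_cv$best_iteration                         # suggested nrounds for the final model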

# Feature importance
importance_matrix <- xgb.importance(model = xgb_model)
print(importance_matrix)
xgb.plot.importance(importance_matrix)

# Predict
xgb_pred_prob <- predict(xgb_model, x_test, reshape = TRUE)
xgb_pred_class <- max.col(xgb_pred_prob) - 1  # back to class codes 0, 1, 2

# Evaluate
xgb_cm <- table(Predicted = xgb_pred_class, Actual = y_test)
print(xgb_cm)
cat("Accuracy:", sum(diag(xgb_cm)) / sum(xgb_cm), "\n")

🌀 Part 4: Unsupervised Learning

4.1 K-means Clustering

# K-means clustering
# Use the iris features (ignoring the labels)
iris_features <- iris[, 1:4]

# Choose K with the elbow method
wss <- numeric(10)  # within-cluster sum of squares for each K
for (k in 1:10) {
  kmeans_result <- kmeans(iris_features, centers = k, nstart = 25)
  wss[k] <- kmeans_result$tot.withinss
}

# Plot the elbow curve
plot(1:10, wss, type = "b", 
     xlab = "Number of clusters (K)", 
     ylab = "Within-cluster sum of squares",
     main = "Elbow Method for Choosing K")

# From the plot, the elbow appears at K = 3
kmeans_model <- kmeans(iris_features, centers = 3, nstart = 25)

# Inspect the clustering result
print(kmeans_model)

# Cluster centers
print(kmeans_model$centers)

# Cluster sizes
print(kmeans_model$size)

# Visualize the clusters
library(factoextra)
fviz_cluster(kmeans_model, data = iris_features,
             palette = c("#2E9FDF", "#00AFBB", "#E7B800"),
             geom = "point",
             ellipse.type = "convex",
             ggtheme = theme_minimal())

# Compare with the true species labels
comparison <- data.frame(
  Actual = iris$Species,
  Cluster = kmeans_model$cluster
)
table(comparison)
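
# A single-number summary of this agreement is the adjusted Rand index; a
# minimal sketch using the mclust package (an additional package, not in the
# install list above):
install.packages("mclust")
library(mclust)
adjustedRandIndex(iris$Species, kmeans_model$cluster)  # 1 = perfect agreement, ~0 = random labelling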

4.2 Hierarchical Clustering

# Hierarchical clustering
# Compute the distance matrix
dist_matrix <- dist(iris_features, method = "euclidean")

# Hierarchical clustering with Ward's method (ward.D2)
hclust_model <- hclust(dist_matrix, method = "ward.D2")

# Plot the dendrogram
plot(hclust_model, 
     main = "Hierarchical Clustering Dendrogram of the Iris Data",
     xlab = "Samples",
     ylab = "Distance",
     cex = 0.6)

# Mark a 3-cluster cut on the dendrogram
rect.hclust(hclust_model, k = 3, border = 2:4)

# Extract the cluster assignments
hclust_clusters <- cutree(hclust_model, k = 3)

# Visualize the dendrogram with colored clusters
fviz_dend(hclust_model, k = 3,
          cex = 0.5,
          k_colors = c("#2E9FDF", "#00AFBB", "#E7B800"),
          color_labels_by_k = TRUE,
          rect = TRUE)

4.3 Dimensionality Reduction with PCA

# Principal component analysis (PCA)
pca_result <- prcomp(iris_features, scale. = TRUE, center = TRUE)

# Summarize the result
summary(pca_result)
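
# The proportions reported by summary() come straight from the component
# standard deviations; computing them by hand makes the link explicit:
explained_var <- pca_result$sdev^2 / sum(pca_result$sdev^2)
round(cumsum(explained_var), 3)   # cumulative proportion of variance explained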

# Scree plot (helps decide how many components to keep)
fviz_eig(pca_result, addlabels = TRUE, 
         ylim = c(0, 100),
         main = "PCA - Percentage of Variance Explained")

# Loadings of the first two principal components
print(pca_result$rotation[, 1:2])

# Plot the observations in PCA space
fviz_pca_ind(pca_result,
             col.ind = iris$Species,
             palette = c("#00AFBB", "#E7B800", "#FC4E07"),
             addEllipses = TRUE,
             ellipse.type = "confidence",
             legend.title = "Species",
             repel = TRUE)

# Variable contribution plot
fviz_pca_var(pca_result,
             col.var = "contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE)

# Biplot
fviz_pca_biplot(pca_result,
                col.ind = iris$Species,
                palette = c("#00AFBB", "#E7B800", "#FC4E07"),
                addEllipses = TRUE,
                ellipse.type = "confidence",
                legend.title = "Species",
                repel = TRUE)

📊 Part 5: Model Evaluation and Validation

5.1 Cross-Validation

# K-fold cross-validation
library(caret)

# Set up the training control parameters
train_control <- trainControl(
  method = "cv",           # cross-validation
  number = 10,             # 10 folds
  savePredictions = TRUE,  # keep the held-out predictions
  classProbs = TRUE,       # keep the class probabilities
  summaryFunction = multiClassSummary  # multi-class evaluation metrics
)

# Train a random forest with cross-validation
rf_cv_model <- train(Species ~ .,
                    data = iris,
                    method = "rf",      # random forest
                    trControl = train_control,
                    tuneLength = 3,     # try 3 candidate parameter settings
                    metric = "Accuracy")

print(rf_cv_model)
plot(rf_cv_model)

# Inspect the cross-validation results
cv_results <- rf_cv_model$results
print(cv_results)

# Repeated cross-validation
train_control_repeated <- trainControl(
  method = "repeatedcv",   # repeated cross-validation
  number = 5,              # 5 folds
  repeats = 3,             # repeated 3 times
  savePredictions = TRUE
)

# Train an SVM model
svm_cv_model <- train(Species ~ .,
                     data = iris,
                     method = "svmRadial",  # RBF-kernel SVM
                     trControl = train_control_repeated,
                     tuneLength = 5,
                     metric = "Accuracy")

print(svm_cv_model)

5.2 Model Evaluation Metrics

# Classification evaluation
# Detailed analysis with a confusion matrix
library(caret)

# Generate predictions (using the random forest model trained earlier)
rf_predictions <- predict(rf_class_model, test_data)

# Detailed evaluation
confusion_matrix <- confusionMatrix(rf_predictions, test_data$Species)
print(confusion_matrix)

# Extract the headline metrics
cat("Overall accuracy:", confusion_matrix$overall["Accuracy"], "\n")
cat("Kappa statistic:", confusion_matrix$overall["Kappa"], "\n")

# Per-class metrics
print(confusion_matrix$byClass)

# ROC curves and AUC (multi-class, one-vs-rest)
library(pROC)

# Use predicted class probabilities so each curve sweeps over all thresholds
# (ROC curves built from hard class predictions have only a single operating point)
rf_pred_prob <- predict(rf_class_model, test_data, type = "prob")

# Compute a one-vs-rest ROC curve for each class
roc_list <- list()
for (i in 1:3) {
  actual_binary <- ifelse(test_data$Species == levels(test_data$Species)[i], 1, 0)
  roc_list[[i]] <- roc(actual_binary, rf_pred_prob[, i])
}

# Plot the three ROC curves together
plot(roc_list[[1]], col = "red", main = "One-vs-Rest ROC Curves")
lines(roc_list[[2]], col = "blue")
lines(roc_list[[3]], col = "green")
legend("bottomright", 
       legend = levels(test_data$Species),
       col = c("red", "blue", "green"),
       lty = 1)
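
# pROC can also summarize performance over all three classes as a single
# multi-class AUC (the Hand-Till average); a minimal sketch using the
# predicted class probabilities:
rf_prob_matrix <- predict(rf_class_model, test_data, type = "prob")
multiclass.roc(test_data$Species, rf_prob_matrix)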

# Regression evaluation
# A helper function that computes several regression metrics at once
regression_metrics <- function(actual, predicted) {
  mae <- mean(abs(actual - predicted))
  mse <- mean((actual - predicted)^2)
  rmse <- sqrt(mse)
  mape <- mean(abs((actual - predicted) / actual)) * 100
  r2 <- 1 - sum((actual - predicted)^2) / sum((actual - mean(actual))^2)
  
  return(data.frame(
    MAE = mae,
    MSE = mse,
    RMSE = rmse,
    MAPE = mape,
    R_squared = r2
  ))
}

# Apply the evaluation function
rf_reg_metrics <- regression_metrics(test_boston$medv, rf_reg_pred)
print(rf_reg_metrics)

5.3 Model Comparison and Selection

# Compare several models
library(caret)

# Prepare the data
set.seed(123)
train_index <- createDataPartition(iris$Species, p = 0.7, list = FALSE)
train_data <- iris[train_index, ]
test_data <- iris[-train_index, ]

# Define the training control
train_control <- trainControl(
  method = "cv",
  number = 5,
  savePredictions = TRUE,
  classProbs = TRUE
)

# Train several models
models <- list()

# 1. Decision tree
models[["DecisionTree"]] <- train(Species ~ .,
                                  data = train_data,
                                  method = "rpart",
                                  trControl = train_control,
                                  tuneLength = 5)

# 2. Random forest
models[["RandomForest"]] <- train(Species ~ .,
                                  data = train_data,
                                  method = "rf",
                                  trControl = train_control,
                                  tuneLength = 3)

# 3. SVM
models[["SVM"]] <- train(Species ~ .,
                        data = train_data,
                        method = "svmRadial",
                        trControl = train_control,
                        tuneLength = 5)

# 4. KNN
models[["KNN"]] <- train(Species ~ .,
                        data = train_data,
                        method = "knn",
                        trControl = train_control,
                        tuneLength = 10)

# Compare model performance
results <- resamples(models)
summary(results)

# Visual comparison
bwplot(results, main = "Model Performance Comparison")
dotplot(results, main = "Model Performance Comparison")

# Performance summary table
model_comparison <- data.frame(
  Model = names(models),
  Accuracy = sapply(models, function(m) max(m$results$Accuracy)),
  Kappa = sapply(models, function(m) max(m$results$Kappa))
)

print(model_comparison[order(-model_comparison$Accuracy), ])

# Evaluate the best-performing model on the test set
best_model <- models[["RandomForest"]]
best_predictions <- predict(best_model, test_data)
best_cm <- confusionMatrix(best_predictions, test_data$Species)
print(best_cm)

🏭 Part 6: Hands-On Case Studies

Case 1: Bank Customer Churn Prediction

# Simulate bank customer data
set.seed(123)
n_customers <- 1000

customer_data <- data.frame(
  customer_id = 1:n_customers,
  age = sample(18:80, n_customers, replace = TRUE),
  gender = sample(c("Male", "Female"), n_customers, replace = TRUE),
  income = round(rnorm(n_customers, mean = 50000, sd = 15000), 0),
  balance = round(rnorm(n_customers, mean = 10000, sd = 5000), 0),
  credit_score = sample(300:850, n_customers, replace = TRUE),
  tenure = sample(1:10, n_customers, replace = TRUE),
  num_products = sample(1:5, n_customers, replace = TRUE),
  has_credit_card = sample(c(0, 1), n_customers, replace = TRUE, prob = c(0.3, 0.7)),
  is_active_member = sample(c(0, 1), n_customers, replace = TRUE, prob = c(0.4, 0.6)),
  satisfaction_score = sample(1:5, n_customers, replace = TRUE)
)

# Create the churn label (rule-based on the features)
customer_data$churn <- with(customer_data, 
                           ifelse(
                             (age > 60 & satisfaction_score < 3) |
                             (balance < 5000 & income < 40000) |
                             (credit_score < 600 & num_products == 1),
                             1, 0))

# Check the churn rate
cat("Customer churn rate:", mean(customer_data$churn) * 100, "%\n")

# Data preprocessing
# Convert the categorical variables to factors
customer_data$gender <- as.factor(customer_data$gender)
customer_data$has_credit_card <- as.factor(customer_data$has_credit_card)
customer_data$is_active_member <- as.factor(customer_data$is_active_member)
customer_data$churn <- as.factor(customer_data$churn)

# Split the data
set.seed(123)
train_index <- createDataPartition(customer_data$churn, p = 0.7, list = FALSE)
train_data <- customer_data[train_index, ]
test_data <- customer_data[-train_index, ]

# Feature selection (drop the ID column)
features <- setdiff(names(train_data), c("customer_id", "churn"))

# Train a random forest model
rf_churn_model <- randomForest(
  churn ~ . - customer_id,
  data = train_data,
  ntree = 100,
  importance = TRUE,
  strata = train_data$churn,  # stratified sampling to handle the class imbalance
  sampsize = c("0" = 300, "1" = 300)  # balanced per-class samples (drawn with replacement by default)
)

# Feature importance
varImpPlot(rf_churn_model, main = "Churn Prediction Feature Importance")

# Predict
churn_predictions <- predict(rf_churn_model, test_data, type = "prob")
churn_class <- ifelse(churn_predictions[, 2] > 0.5, 1, 0)

# Evaluate (keeping the class imbalance in mind)
confusion_matrix <- table(Predicted = churn_class, Actual = test_data$churn)
print(confusion_matrix)

# Precision, recall, and F1 score
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ])
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2])
f1_score <- 2 * precision * recall / (precision + recall)

cat(sprintf("Precision: %.2f%%\n", precision * 100))
cat(sprintf("Recall: %.2f%%\n", recall * 100))
cat(sprintf("F1 score: %.2f\n", f1_score))

# ROC curve
library(pROC)
roc_curve <- roc(as.numeric(test_data$churn) - 1, churn_predictions[, 2])
plot(roc_curve, main = "Customer Churn Prediction ROC Curve")
auc_value <- auc(roc_curve)
cat(sprintf("AUC: %.3f\n", auc_value))

Case 2: Optimizing a House Price Prediction Model

# Optimizing Boston house price prediction
library(MASS)
data(Boston)

# Data exploration
cat("=== Boston Housing Data Exploration ===\n")
cat("Dimensions:", dim(Boston), "\n")
cat("House price (medv) summary:\n")
print(summary(Boston$medv))

# Feature engineering
# 1. Create new features
Boston$rooms_per_house <- Boston$rm / Boston$dis
Boston$tax_per_room <- Boston$tax / Boston$rm
Boston$age_squared <- Boston$age^2

# 2. Interaction features
Boston$nox_indus <- Boston$nox * Boston$indus
Boston$rad_tax <- Boston$rad * Boston$tax

# 3. Log transforms (for skewed distributions)
Boston$log_crim <- log(Boston$crim + 1)  # add 1 to avoid log(0)
Boston$log_lstat <- log(Boston$lstat + 1)

# Split the data
set.seed(123)
train_index <- sample(1:nrow(Boston), floor(nrow(Boston) * 0.7))
train_data <- Boston[train_index, ]
test_data <- Boston[-train_index, ]

# Train several regression models
models <- list()

# Linear regression
lm_model <- lm(medv ~ ., data = train_data)
models[["LinearRegression"]] <- lm_model

# Ridge regression
library(glmnet)
x_train <- model.matrix(medv ~ ., train_data)[, -1]
y_train <- train_data$medv

cv_ridge <- cv.glmnet(x_train, y_train, alpha = 0)  # alpha = 0 gives ridge regression
ridge_model <- glmnet(x_train, y_train, alpha = 0, lambda = cv_ridge$lambda.min)
models[["Ridge"]] <- ridge_model

# Lasso regression
cv_lasso <- cv.glmnet(x_train, y_train, alpha = 1)  # alpha = 1 gives the lasso
lasso_model <- glmnet(x_train, y_train, alpha = 1, lambda = cv_lasso$lambda.min)
models[["Lasso"]] <- lasso_model

# Random forest regression
rf_model <- randomForest(medv ~ ., data = train_data, ntree = 200)
models[["RandomForest"]] <- rf_model

# XGBoost regression
library(xgboost)

# Prepare the data
x_train_xgb <- as.matrix(train_data[, -which(names(train_data) == "medv")])
y_train_xgb <- train_data$medv

xgb_model <- xgboost(
  data = x_train_xgb,
  label = y_train_xgb,
  nrounds = 100,
  objective = "reg:squarederror",
  eta = 0.1,
  max_depth = 6,
  verbose = 0
)
models[["XGBoost"]] <- xgb_model

# Model evaluation helper
evaluate_model <- function(model, model_name, test_data) {
  if (model_name %in% c("Ridge", "Lasso")) {
    # glmnet models need a model matrix for newx
    x_test <- model.matrix(medv ~ ., test_data)[, -1]
    predictions <- predict(model, newx = x_test)
  } else if (model_name == "XGBoost") {
    # XGBoost expects a plain numeric matrix
    x_test <- as.matrix(test_data[, -which(names(test_data) == "medv")])
    predictions <- predict(model, x_test)
  } else {
    # Other models accept a data frame directly
    predictions <- predict(model, test_data)
  }
  
  actual <- test_data$medv
  mae <- mean(abs(predictions - actual))
  rmse <- sqrt(mean((predictions - actual)^2))
  r2 <- 1 - sum((actual - predictions)^2) / sum((actual - mean(actual))^2)
  
  return(data.frame(
    Model = model_name,
    MAE = mae,
    RMSE = rmse,
    R2 = r2
  ))
}

# Compare all models
results <- data.frame()
for (model_name in names(models)) {
  model_result <- evaluate_model(models[[model_name]], model_name, test_data)
  results <- rbind(results, model_result)
}

# Sort and display the results
results <- results[order(results$RMSE), ]
print(results)

# Visual comparison
library(ggplot2)
ggplot(results, aes(x = reorder(Model, -RMSE), y = RMSE)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Model Performance Comparison (RMSE)")