📚 Today's Goals
- Understand the basic concepts and categories of machine learning
- Master data preprocessing and feature engineering
- Learn fundamental supervised learning algorithms
- Understand unsupervised learning methods
- Master model evaluation and validation techniques
🤖 Part 1: Machine Learning Basics
1.1 Overview of Machine Learning
# Machine learning is the practice of letting computers learn patterns from data
# Main types:
# 1. Supervised learning: labeled data
# 2. Unsupervised learning: unlabeled data
# 3. Semi-supervised learning: partially labeled data
# 4. Reinforcement learning: learning through rewards
# Install the required machine learning packages
install.packages(c("caret", "randomForest", "e1071", "glmnet", "rpart",
                   "xgboost", "cluster", "factoextra", "kknn", "kernlab"))
# Load the packages
library(caret)          # unified interface for classification and regression training
library(randomForest)   # random forests
library(e1071)          # SVM and other algorithms
library(glmnet)         # regularized regression
library(rpart)          # decision trees
library(xgboost)        # gradient-boosted trees
library(cluster)        # cluster analysis
library(factoextra)     # clustering visualization
1.2 The Machine Learning Workflow
# A typical machine learning workflow:
# 1. Data collection and exploration
# 2. Data preprocessing
# 3. Feature engineering
# 4. Model selection and training
# 5. Model evaluation
# 6. Model tuning
# 7. Model deployment
# We will practice on several classic datasets
# Classification: the iris dataset
data(iris)
# Regression: the Boston housing dataset (from the MASS package, which ships with R)
library(MASS)
data(Boston)
# Clustering: the iris dataset again (labels ignored)
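Before walking through each stage in detail, the short sketch below strings the workflow together end to end; the model choice (k-nearest neighbors) and all parameter values here are illustrative, not recommendations:
# Minimal end-to-end sketch: split -> preprocess -> train -> evaluate
set.seed(42)
idx <- createDataPartition(iris$Species, p = 0.8, list = FALSE)
tr <- iris[idx, ]
te <- iris[-idx, ]
pp <- preProcess(tr[, 1:4], method = c("center", "scale"))  # fit the scaler on training data only
tr[, 1:4] <- predict(pp, tr[, 1:4])
te[, 1:4] <- predict(pp, te[, 1:4])
fit <- train(Species ~ ., data = tr, method = "knn")        # any caret model would do here
mean(predict(fit, te) == te$Species)                        # test-set accuracy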
🧹 Part 2: Data Preprocessing & Feature Engineering
2.1 Data Splitting
# Split the data into training and test sets (and, when needed, a validation set)
# Set a random seed for reproducibility
set.seed(123)
# Method 1: caret's createDataPartition
library(caret)
# Stratified sampling for classification (preserves the class proportions)
train_index <- createDataPartition(iris$Species,
                                   p = 0.7,       # 70% training set
                                   list = FALSE,  # return a vector rather than a list
                                   times = 1)     # split only once
train_data <- iris[train_index, ]
test_data <- iris[-train_index, ]
# Check the class distributions
cat("Training set class distribution:\n")
print(table(train_data$Species))
cat("\nTest set class distribution:\n")
print(table(test_data$Species))
# Method 2: simple random sampling for regression problems
train_index_boston <- sample(1:nrow(Boston), nrow(Boston) * 0.7)
train_boston <- Boston[train_index_boston, ]
test_boston <- Boston[-train_index_boston, ]
# Method 3: cross-validation folds via caret's createFolds
cv_folds <- createFolds(iris$Species, k = 5, returnTrain = TRUE)
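createFolds only returns lists of row indices; a short loop shows how they would typically be consumed. The rpart model below is just a placeholder:
# Train and score a model on each of the 5 folds
fold_accuracy <- sapply(cv_folds, function(train_rows) {
  fold_train <- iris[train_rows, ]
  fold_test <- iris[-train_rows, ]
  fold_model <- rpart(Species ~ ., data = fold_train, method = "class")
  mean(predict(fold_model, fold_test, type = "class") == fold_test$Species)
})
round(fold_accuracy, 3)  # per-fold accuracy
mean(fold_accuracy)      # cross-validated estimate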
2.2 Data Preprocessing
# Handling missing values
# Create an example dataset with missing values
data_with_na <- iris
data_with_na[sample(1:nrow(iris), 10), "Sepal.Length"] <- NA
data_with_na[sample(1:nrow(iris), 5), "Petal.Width"] <- NA
# Count the missing values per column
colSums(is.na(data_with_na))
# Method 1: drop rows with missing values
data_complete <- na.omit(data_with_na)
# Method 2: impute with the mean/median
data_imputed <- data_with_na
for (col in names(data_imputed)) {
  if (is.numeric(data_imputed[[col]])) {
    data_imputed[[col]][is.na(data_imputed[[col]])] <-
      median(data_imputed[[col]], na.rm = TRUE)
  }
}
# Method 3: multiple imputation with the mice package (more advanced)
install.packages("mice")
library(mice)
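The original stops after loading mice, so here is a minimal sketch of how the multiple imputation would look; the choice of predictive mean matching ("pmm") and m = 5 imputed datasets is illustrative:
# Impute the missing cells with predictive mean matching
imp <- mice(data_with_na, m = 5, method = "pmm", seed = 123, printFlag = FALSE)
data_mice <- complete(imp, 1)  # extract the first completed dataset
colSums(is.na(data_mice))      # should all be zero now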
# Standardization / normalization
# Method 1: manual z-score standardization
standardize <- function(x) {
  (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}
iris_scaled <- iris
iris_scaled[, 1:4] <- apply(iris_scaled[, 1:4], 2, standardize)
# Method 2: caret's preProcess function
preprocess_params <- preProcess(iris[, 1:4], method = c("center", "scale"))
iris_processed <- predict(preprocess_params, iris[, 1:4])
# Check the standardized results
summary(iris_scaled[, 1:4])
summary(iris_processed)
# Handling categorical variables
# Create dummy variables (one-hot encoding)
dummy_vars <- model.matrix(~ Species - 1, data = iris)
head(dummy_vars)
# Using caret's dummyVars function
dummy_model <- dummyVars(~ Species, data = iris, fullRank = TRUE)
iris_dummy <- predict(dummy_model, iris)
head(iris_dummy)
2.3 Feature Engineering
# Feature engineering: creating new features and selecting the important ones
# 1. Interaction features
iris$Sepal.Area <- iris$Sepal.Length * iris$Sepal.Width
iris$Petal.Area <- iris$Petal.Length * iris$Petal.Width
iris$Length.Ratio <- iris$Petal.Length / iris$Sepal.Length
# 2. Polynomial features
iris$Sepal.Length.sq <- iris$Sepal.Length^2
iris$Sepal.Length.cu <- iris$Sepal.Length^3
# 3. Binning (discretizing a continuous variable)
iris$Sepal.Length.bin <- cut(iris$Sepal.Length,
                             breaks = quantile(iris$Sepal.Length,
                                               probs = seq(0, 1, 0.25)),
                             labels = c("Q1", "Q2", "Q3", "Q4"),
                             include.lowest = TRUE)  # otherwise the minimum value becomes NA
# 4. Feature selection
# Method 1: correlation-based selection
cor_matrix <- cor(iris[, 1:4])
print(cor_matrix)
# Method 2: feature importance from a random forest
rf_model <- randomForest(Species ~ ., data = iris, importance = TRUE)
importance_scores <- importance(rf_model)
print(importance_scores)
# Visualize the feature importance
varImpPlot(rf_model)
# Method 3: recursive feature elimination (RFE)
library(caret)
# Define the control parameters
rfe_control <- rfeControl(functions = rfFuncs,  # use random forests
                          method = "cv",        # cross-validation
                          number = 5,           # 5 folds
                          verbose = FALSE)
# Run RFE
rfe_result <- rfe(iris[, 1:4], iris$Species,
                  sizes = c(1:4),  # candidate feature-set sizes to test
                  rfeControl = rfe_control)
print(rfe_result)
plot(rfe_result)
📈 Part 3: Supervised Learning Algorithms
3.1 Logistic Regression (Classification)
# Logistic regression handles binary classification and extends to multiclass
# (a multinomial example follows at the end of this subsection)
# Prepare a binary classification problem
iris_binary <- iris
iris_binary$Species_binary <- ifelse(iris$Species == "setosa", 1, 0)
# Split the data
set.seed(123)
train_index <- createDataPartition(iris_binary$Species_binary, p = 0.7, list = FALSE)
train_data <- iris_binary[train_index, ]
test_data <- iris_binary[-train_index, ]
# Train the logistic regression model
# (setosa is linearly separable from the other species, so glm may warn that
# fitted probabilities of 0 or 1 occurred; that is expected here)
logistic_model <- glm(Species_binary ~ Sepal.Length + Sepal.Width +
                        Petal.Length + Petal.Width,
                      data = train_data,
                      family = binomial(link = "logit"))
summary(logistic_model)
# Predict
test_predictions <- predict(logistic_model, newdata = test_data, type = "response")
test_predictions_class <- ifelse(test_predictions > 0.5, 1, 0)
# Evaluate
confusion_matrix <- table(Predicted = test_predictions_class,
                          Actual = test_data$Species_binary)
print(confusion_matrix)
# Compute the performance metrics
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ])
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2])
f1_score <- 2 * (precision * recall) / (precision + recall)
cat(sprintf("Accuracy: %.2f%%\n", accuracy * 100))
cat(sprintf("Precision: %.2f%%\n", precision * 100))
cat(sprintf("Recall: %.2f%%\n", recall * 100))
cat(sprintf("F1 score: %.2f\n", f1_score))
# ROC curve and AUC
install.packages("pROC")
library(pROC)
roc_curve <- roc(test_data$Species_binary, test_predictions)
plot(roc_curve, main = "ROC Curve")
auc_value <- auc(roc_curve)
cat(sprintf("AUC: %.3f\n", auc_value))
3.2 Decision Trees
# Decision tree classification with the rpart package
library(rpart)
install.packages("rpart.plot")  # not in the install list above
library(rpart.plot)
# Re-split the original iris data: train_data/test_data were overwritten in 3.1
# with the binary version, whose Species_binary column would leak the label
set.seed(123)
train_index <- createDataPartition(iris$Species, p = 0.7, list = FALSE)
train_data <- iris[train_index, ]
test_data <- iris[-train_index, ]
# Train a decision tree model
tree_model <- rpart(Species ~ .,
                    data = train_data,
                    method = "class",  # classification task
                    control = rpart.control(
                      minsplit = 10,   # minimum observations needed to attempt a split
                      minbucket = 5,   # minimum observations in a leaf
                      maxdepth = 5,    # maximum tree depth
                      cp = 0.01        # complexity parameter
                    ))
# Visualize the decision tree
rpart.plot(tree_model,
           type = 2,
           extra = 104,
           fallen.leaves = TRUE,
           main = "Iris Classification Decision Tree")
# Variable importance
print(tree_model$variable.importance)
# Predict
tree_predictions <- predict(tree_model, test_data, type = "class")
# Evaluate
tree_cm <- table(Predicted = tree_predictions, Actual = test_data$Species)
print(tree_cm)
# Decision tree regression (Boston housing data)
set.seed(123)
boston_index <- sample(1:nrow(Boston), nrow(Boston) * 0.7)
train_boston <- Boston[boston_index, ]
test_boston <- Boston[-boston_index, ]
tree_reg_model <- rpart(medv ~ .,
                        data = train_boston,
                        method = "anova",  # regression task
                        control = rpart.control(
                          minsplit = 20,
                          minbucket = 10,
                          maxdepth = 5,
                          cp = 0.01
                        ))
# Predict and evaluate the regression tree
tree_reg_pred <- predict(tree_reg_model, test_boston)
# Regression metrics
mae <- mean(abs(tree_reg_pred - test_boston$medv))
rmse <- sqrt(mean((tree_reg_pred - test_boston$medv)^2))
r_squared <- 1 - sum((test_boston$medv - tree_reg_pred)^2) /
  sum((test_boston$medv - mean(test_boston$medv))^2)
cat(sprintf("MAE: %.3f\n", mae))
cat(sprintf("RMSE: %.3f\n", rmse))
cat(sprintf("R²: %.3f\n", r_squared))
3.3 Random Forests
# Random forests: an ensemble of many decision trees
# Classification
rf_class_model <- randomForest(Species ~ .,
                               data = train_data,
                               ntree = 100,  # number of trees
                               mtry = 2,     # features tried at each split
                               importance = TRUE,
                               proximity = TRUE)
print(rf_class_model)
# Variable importance
importance(rf_class_model)
varImpPlot(rf_class_model, main = "Random Forest Feature Importance")
# Predict
rf_predictions <- predict(rf_class_model, test_data)
# Evaluate
rf_cm <- table(Predicted = rf_predictions, Actual = test_data$Species)
print(rf_cm)
cat("Accuracy:", sum(diag(rf_cm)) / sum(rf_cm), "\n")
# Regression
rf_reg_model <- randomForest(medv ~ .,
                             data = train_boston,
                             ntree = 200,
                             mtry = 4,
                             importance = TRUE)
# Predict and evaluate
rf_reg_pred <- predict(rf_reg_model, test_boston)
rf_mae <- mean(abs(rf_reg_pred - test_boston$medv))
rf_rmse <- sqrt(mean((rf_reg_pred - test_boston$medv)^2))
rf_r2 <- 1 - sum((test_boston$medv - rf_reg_pred)^2) /
  sum((test_boston$medv - mean(test_boston$medv))^2)
cat(sprintf("Random forest regression - MAE: %.3f, RMSE: %.3f, R²: %.3f\n",
            rf_mae, rf_rmse, rf_r2))
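mtry was set by hand above; randomForest ships a helper, tuneRF, that searches for an mtry with low out-of-bag error. A quick sketch (the stepFactor and improve values are illustrative):
# Search for a good mtry using the out-of-bag error
x_boston <- train_boston[, setdiff(names(train_boston), "medv")]
set.seed(123)
tuned_mtry <- tuneRF(x_boston, train_boston$medv,
                     ntreeTry = 200,    # trees grown per candidate mtry
                     stepFactor = 1.5,  # factor by which mtry changes at each step
                     improve = 0.01,    # minimum relative OOB improvement to keep searching
                     trace = FALSE, plot = FALSE)
print(tuned_mtry)  # OOB error for each mtry tried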
3.4 Support Vector Machines (SVM)
# SVM classification
library(e1071)
# Linear SVM
svm_linear_model <- svm(Species ~ .,
                        data = train_data,
                        kernel = "linear",
                        cost = 1,      # penalty parameter
                        scale = TRUE)  # standardize the data
# Predict
svm_linear_pred <- predict(svm_linear_model, test_data)
# Evaluate
svm_linear_cm <- table(Predicted = svm_linear_pred, Actual = test_data$Species)
print(svm_linear_cm)
# Radial basis function (RBF) kernel SVM (nonlinear)
svm_rbf_model <- svm(Species ~ .,
                     data = train_data,
                     kernel = "radial",
                     cost = 1,
                     gamma = 0.1,  # kernel parameter
                     scale = TRUE)
svm_rbf_pred <- predict(svm_rbf_model, test_data)
svm_rbf_cm <- table(Predicted = svm_rbf_pred, Actual = test_data$Species)
print(svm_rbf_cm)
# Hyperparameter tuning via cross-validation
tune_result <- tune(svm, Species ~ ., data = train_data,
                    ranges = list(
                      cost = c(0.1, 1, 10, 100),
                      gamma = c(0.01, 0.1, 1, 10)
                    ),
                    kernel = "radial")
print(tune_result)
plot(tune_result)
# Refit with the best parameters
best_svm_model <- tune_result$best.model
best_svm_pred <- predict(best_svm_model, test_data)
best_svm_cm <- table(Predicted = best_svm_pred, Actual = test_data$Species)
print(best_svm_cm)
3.5 Gradient-Boosted Trees (XGBoost)
# XGBoost: an efficient implementation of gradient boosting
library(xgboost)
# Prepare the data (convert to matrix form)
x_train <- as.matrix(train_data[, 1:4])
y_train <- as.numeric(train_data$Species) - 1  # recode as 0, 1, 2
x_test <- as.matrix(test_data[, 1:4])
y_test <- as.numeric(test_data$Species) - 1
# Set the parameters
params <- list(
  objective = "multi:softprob",  # multiclass with class probabilities
  num_class = 3,                 # number of classes
  eta = 0.3,                     # learning rate
  max_depth = 6,                 # maximum tree depth
  min_child_weight = 1,
  subsample = 0.8,               # row subsampling ratio
  colsample_bytree = 0.8,        # feature subsampling ratio
  eval_metric = "mlogloss"       # evaluation metric
)
# Train the model
xgb_model <- xgboost(
  data = x_train,
  label = y_train,
  params = params,
  nrounds = 100,  # number of boosting rounds
  verbose = 0     # suppress the training log
)
# Feature importance
importance_matrix <- xgb.importance(model = xgb_model)
print(importance_matrix)
xgb.plot.importance(importance_matrix)
# Predict
xgb_pred_prob <- predict(xgb_model, x_test, reshape = TRUE)
xgb_pred_class <- max.col(xgb_pred_prob) - 1  # back to 0, 1, 2
# Evaluate
xgb_cm <- table(Predicted = xgb_pred_class, Actual = y_test)
print(xgb_cm)
cat("Accuracy:", sum(diag(xgb_cm)) / sum(xgb_cm), "\n")
🌀 Part 4: Unsupervised Learning
4.1 K-means Clustering
# K-means clustering
# Use the iris features and ignore the labels
iris_features <- iris[, 1:4]
# Choose K with the elbow method
wss <- numeric(10)  # within-cluster sum of squares for each K
for (k in 1:10) {
  kmeans_result <- kmeans(iris_features, centers = k, nstart = 25)
  wss[k] <- kmeans_result$tot.withinss
}
# Plot the elbow curve
plot(1:10, wss, type = "b",
     xlab = "Number of clusters (K)",
     ylab = "Within-cluster sum of squares",
     main = "Elbow Method for Choosing K")
# The elbow appears at K = 3
kmeans_model <- kmeans(iris_features, centers = 3, nstart = 25)
# Inspect the clustering result
print(kmeans_model)
# Cluster centers
print(kmeans_model$centers)
# Cluster sizes
print(kmeans_model$size)
# Visualize the clusters
library(factoextra)
fviz_cluster(kmeans_model, data = iris_features,
             palette = c("#2E9FDF", "#00AFBB", "#E7B800"),
             geom = "point",
             ellipse.type = "convex",
             ggtheme = theme_minimal())
# Compare against the true labels
comparison <- data.frame(
  Actual = iris$Species,
  Cluster = kmeans_model$cluster
)
table(comparison)
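The elbow heuristic is somewhat subjective; average silhouette width offers a more quantitative criterion, and both factoextra and the cluster package support it. A quick sketch:
# Average silhouette width for each candidate K (higher is better)
fviz_nbclust(iris_features, kmeans, method = "silhouette")
# Silhouette diagnostics for the chosen K = 3 solution
sil <- silhouette(kmeans_model$cluster, dist(iris_features))
mean(sil[, "sil_width"])  # overall average silhouette width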
4.2 Hierarchical Clustering
# Hierarchical clustering
# Compute the distance matrix
dist_matrix <- dist(iris_features, method = "euclidean")
# Hierarchical clustering with Ward's method (ward.D2)
hclust_model <- hclust(dist_matrix, method = "ward.D2")
# Plot the dendrogram
plot(hclust_model,
     main = "Hierarchical Clustering Dendrogram of the Iris Data",
     xlab = "Samples",
     ylab = "Distance",
     cex = 0.6)
# Draw the cut (3 clusters)
rect.hclust(hclust_model, k = 3, border = 2:4)
# Extract the cluster assignments
hclust_clusters <- cutree(hclust_model, k = 3)
# Visualize
fviz_dend(hclust_model, k = 3,
          cex = 0.5,
          k_colors = c("#2E9FDF", "#00AFBB", "#E7B800"),
          color_labels_by_k = TRUE,
          rect = TRUE)
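As with K-means, cross-tabulating the cut against the species labels shows how well the hierarchy recovers them:
# How well do the three hierarchical clusters match the species?
table(Cluster = hclust_clusters, Actual = iris$Species)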
4.3 Dimensionality Reduction with PCA
# Principal component analysis (PCA)
pca_result <- prcomp(iris_features, scale. = TRUE, center = TRUE)
# Inspect the result
summary(pca_result)
# Scree plot (how many components matter)
fviz_eig(pca_result, addlabels = TRUE,
         ylim = c(0, 100),
         main = "PCA - Proportion of Variance Explained")
# Loadings of the first two principal components
print(pca_result$rotation[, 1:2])
# Visualize the individuals in PC space
fviz_pca_ind(pca_result,
             col.ind = iris$Species,
             palette = c("#00AFBB", "#E7B800", "#FC4E07"),
             addEllipses = TRUE,
             ellipse.type = "confidence",
             legend.title = "Species",
             repel = TRUE)
# Variable contribution plot
fviz_pca_var(pca_result,
             col.var = "contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE)
# Biplot
fviz_pca_biplot(pca_result,
                col.ind = iris$Species,
                palette = c("#00AFBB", "#E7B800", "#FC4E07"),
                addEllipses = TRUE,
                ellipse.type = "confidence",
                legend.title = "Species",
                repel = TRUE)
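The component scores in pca_result$x can feed straight into a downstream model; a brief sketch using only the first two components, which carry most of the variance in iris:
# Use the first two principal components as model inputs
iris_pca <- data.frame(pca_result$x[, 1:2], Species = iris$Species)
pca_rf <- randomForest(Species ~ PC1 + PC2, data = iris_pca, ntree = 100)
print(pca_rf)  # OOB error with just two derived features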
📊 Part 5: Model Evaluation & Validation
5.1 Cross-Validation
# K-fold cross-validation
library(caret)
# Set the training control parameters
train_control <- trainControl(
  method = "cv",                       # cross-validation
  number = 10,                         # 10 folds
  savePredictions = TRUE,              # keep the held-out predictions
  classProbs = TRUE,                   # keep the class probabilities
  summaryFunction = multiClassSummary  # multiclass evaluation metrics
)
# Train a random forest with cross-validation
rf_cv_model <- train(Species ~ .,
                     data = iris,
                     method = "rf",   # random forest
                     trControl = train_control,
                     tuneLength = 3,  # try 3 parameter settings
                     metric = "Accuracy")
print(rf_cv_model)
plot(rf_cv_model)
# Inspect the cross-validation results
cv_results <- rf_cv_model$results
print(cv_results)
# Repeated cross-validation
train_control_repeated <- trainControl(
  method = "repeatedcv",  # repeated cross-validation
  number = 5,             # 5 folds
  repeats = 3,            # repeated 3 times
  savePredictions = TRUE
)
# Train an SVM
svm_cv_model <- train(Species ~ .,
                      data = iris,
                      method = "svmRadial",  # RBF-kernel SVM
                      trControl = train_control_repeated,
                      tuneLength = 5,
                      metric = "Accuracy")
print(svm_cv_model)
5.2 Model Evaluation Metrics
# Evaluating classification models
# A detailed look at the confusion matrix
library(caret)
# Generate predictions (reusing the random forest model from earlier)
rf_predictions <- predict(rf_class_model, test_data)
# Detailed evaluation
confusion_matrix <- confusionMatrix(rf_predictions, test_data$Species)
print(confusion_matrix)
# Extract the headline metrics
cat("Overall accuracy:", confusion_matrix$overall["Accuracy"], "\n")
cat("Kappa statistic:", confusion_matrix$overall["Kappa"], "\n")
# Per-class metrics
print(confusion_matrix$byClass)
# ROC curves and AUC (multiclass, one-vs-rest)
library(pROC)
# Compute one ROC curve per class; ROC needs scores, so use the predicted
# class probabilities rather than the hard class labels
rf_probs <- predict(rf_class_model, test_data, type = "prob")
roc_list <- list()
for (i in 1:3) {
  actual_binary <- ifelse(test_data$Species == levels(test_data$Species)[i], 1, 0)
  roc_list[[i]] <- roc(actual_binary, rf_probs[, i])
}
# Plot the ROC curves together
plot(roc_list[[1]], col = "red", main = "Multiclass ROC Curves (One-vs-Rest)")
lines(roc_list[[2]], col = "blue")
lines(roc_list[[3]], col = "green")
legend("bottomright",
       legend = levels(test_data$Species),
       col = c("red", "blue", "green"),
       lty = 1)
# Evaluating regression models
# A helper that computes several regression metrics at once
regression_metrics <- function(actual, predicted) {
  mae <- mean(abs(actual - predicted))
  mse <- mean((actual - predicted)^2)
  rmse <- sqrt(mse)
  mape <- mean(abs((actual - predicted) / actual)) * 100
  r2 <- 1 - sum((actual - predicted)^2) / sum((actual - mean(actual))^2)
  return(data.frame(
    MAE = mae,
    MSE = mse,
    RMSE = rmse,
    MAPE = mape,
    R_squared = r2
  ))
}
# Apply the helper
rf_reg_metrics <- regression_metrics(test_boston$medv, rf_reg_pred)
print(rf_reg_metrics)
5.3 Model Comparison & Selection
# Comparing several models
library(caret)
# Prepare the data
set.seed(123)
train_index <- createDataPartition(iris$Species, p = 0.7, list = FALSE)
train_data <- iris[train_index, ]
test_data <- iris[-train_index, ]
# Define the training control
train_control <- trainControl(
  method = "cv",
  number = 5,
  savePredictions = TRUE,
  classProbs = TRUE
)
# Train several models
models <- list()
# 1. Decision tree
models[["Decision Tree"]] <- train(Species ~ .,
                                   data = train_data,
                                   method = "rpart",
                                   trControl = train_control,
                                   tuneLength = 5)
# 2. Random forest
models[["Random Forest"]] <- train(Species ~ .,
                                   data = train_data,
                                   method = "rf",
                                   trControl = train_control,
                                   tuneLength = 3)
# 3. SVM
models[["SVM"]] <- train(Species ~ .,
                         data = train_data,
                         method = "svmRadial",
                         trControl = train_control,
                         tuneLength = 5)
# 4. KNN
models[["KNN"]] <- train(Species ~ .,
                         data = train_data,
                         method = "knn",
                         trControl = train_control,
                         tuneLength = 10)
# Compare model performance
results <- resamples(models)
summary(results)
# Visual comparison
bwplot(results, main = "Model Performance Comparison")
dotplot(results, main = "Model Performance Comparison")
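resamples objects also support paired difference tests, which indicate whether the gaps between models are larger than the fold-to-fold noise:
# Paired comparisons of the resampled metrics
model_diffs <- diff(results)
summary(model_diffs)  # mean differences with p-values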
# Performance summary table
model_comparison <- data.frame(
  Model = names(models),
  Accuracy = sapply(models, function(m) max(m$results$Accuracy)),
  Kappa = sapply(models, function(m) max(m$results$Kappa))
)
print(model_comparison[order(-model_comparison$Accuracy), ])
# Evaluate the best model on the test set
best_model <- models[["Random Forest"]]
best_predictions <- predict(best_model, test_data)
best_cm <- confusionMatrix(best_predictions, test_data$Species)
print(best_cm)
🏭 Part 6: Hands-On Case Studies
Case 1: Predicting Bank Customer Churn
# Simulate a bank customer dataset
set.seed(123)
n_customers <- 1000
customer_data <- data.frame(
  customer_id = 1:n_customers,
  age = sample(18:80, n_customers, replace = TRUE),
  gender = sample(c("Male", "Female"), n_customers, replace = TRUE),
  income = round(rnorm(n_customers, mean = 50000, sd = 15000), 0),
  balance = round(rnorm(n_customers, mean = 10000, sd = 5000), 0),
  credit_score = sample(300:850, n_customers, replace = TRUE),
  tenure = sample(1:10, n_customers, replace = TRUE),
  num_products = sample(1:5, n_customers, replace = TRUE),
  has_credit_card = sample(c(0, 1), n_customers, replace = TRUE, prob = c(0.3, 0.7)),
  is_active_member = sample(c(0, 1), n_customers, replace = TRUE, prob = c(0.4, 0.6)),
  satisfaction_score = sample(1:5, n_customers, replace = TRUE)
)
# Create the churn label (rule-based on the features)
customer_data$churn <- with(customer_data,
                            ifelse(
                              (age > 60 & satisfaction_score < 3) |
                                (balance < 5000 & income < 40000) |
                                (credit_score < 600 & num_products == 1),
                              1, 0))
# Check the churn rate
cat("Customer churn rate:", mean(customer_data$churn) * 100, "%\n")
# Data preprocessing
# Convert the categorical variables to factors
customer_data$gender <- as.factor(customer_data$gender)
customer_data$has_credit_card <- as.factor(customer_data$has_credit_card)
customer_data$is_active_member <- as.factor(customer_data$is_active_member)
customer_data$churn <- as.factor(customer_data$churn)
# Split the data
set.seed(123)
train_index <- createDataPartition(customer_data$churn, p = 0.7, list = FALSE)
train_data <- customer_data[train_index, ]
test_data <- customer_data[-train_index, ]
# Feature selection (drop the ID column)
features <- setdiff(names(train_data), c("customer_id", "churn"))
# Train a random forest model
rf_churn_model <- randomForest(
  churn ~ . - customer_id,
  data = train_data,
  ntree = 100,
  importance = TRUE,
  strata = train_data$churn,          # stratify sampling to handle the class imbalance
  sampsize = c("0" = 300, "1" = 300)  # balanced per-class draws (with replacement, the default)
)
# Feature importance
varImpPlot(rf_churn_model, main = "Customer Churn Prediction: Feature Importance")
# Predict
churn_predictions <- predict(rf_churn_model, test_data, type = "prob")
churn_class <- ifelse(churn_predictions[, 2] > 0.5, 1, 0)
# Evaluate (mindful of the class imbalance)
confusion_matrix <- table(Predicted = churn_class, Actual = test_data$churn)
print(confusion_matrix)
# Precision, recall, and F1
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ])
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2])
f1_score <- 2 * precision * recall / (precision + recall)
cat(sprintf("Precision: %.2f%%\n", precision * 100))
cat(sprintf("Recall: %.2f%%\n", recall * 100))
cat(sprintf("F1 score: %.2f\n", f1_score))
# ROC curve
library(pROC)
roc_curve <- roc(as.numeric(test_data$churn) - 1, churn_predictions[, 2])
plot(roc_curve, main = "Customer Churn Prediction ROC Curve")
auc_value <- auc(roc_curve)
cat(sprintf("AUC: %.3f\n", auc_value))
Case 2: House Price Prediction and Model Optimization
# Optimizing Boston house price predictions
library(MASS)
data(Boston)
# Data exploration
cat("=== Exploring the Boston Housing Data ===\n")
cat("Dimensions:", dim(Boston), "\n")
cat("House price (medv) summary:\n")
print(summary(Boston$medv))
# Feature engineering
# 1. New derived features
Boston$rooms_per_house <- Boston$rm / Boston$dis
Boston$tax_per_room <- Boston$tax / Boston$rm
Boston$age_squared <- Boston$age^2
# 2. Interaction features
Boston$nox_indus <- Boston$nox * Boston$indus
Boston$rad_tax <- Boston$rad * Boston$tax
# 3. Log transforms (to tame skewed distributions)
Boston$log_crim <- log(Boston$crim + 1)  # +1 avoids log(0)
Boston$log_lstat <- log(Boston$lstat + 1)
# Split the data
set.seed(123)
train_index <- sample(1:nrow(Boston), nrow(Boston) * 0.7)
train_data <- Boston[train_index, ]
test_data <- Boston[-train_index, ]
# Train several regression models
models <- list()
# Linear regression
lm_model <- lm(medv ~ ., data = train_data)
models[["Linear Regression"]] <- lm_model
# Ridge regression
library(glmnet)
x_train <- model.matrix(medv ~ ., train_data)[, -1]
y_train <- train_data$medv
cv_ridge <- cv.glmnet(x_train, y_train, alpha = 0)  # alpha = 0 gives ridge
ridge_model <- glmnet(x_train, y_train, alpha = 0, lambda = cv_ridge$lambda.min)
models[["Ridge Regression"]] <- ridge_model
# Lasso regression
cv_lasso <- cv.glmnet(x_train, y_train, alpha = 1)  # alpha = 1 gives the lasso
lasso_model <- glmnet(x_train, y_train, alpha = 1, lambda = cv_lasso$lambda.min)
models[["Lasso Regression"]] <- lasso_model
# Random forest regression
rf_model <- randomForest(medv ~ ., data = train_data, ntree = 200)
models[["Random Forest"]] <- rf_model
# XGBoost regression
library(xgboost)
# Prepare the data
x_train_xgb <- as.matrix(train_data[, -which(names(train_data) == "medv")])
y_train_xgb <- train_data$medv
xgb_model <- xgboost(
  data = x_train_xgb,
  label = y_train_xgb,
  nrounds = 100,
  objective = "reg:squarederror",
  eta = 0.1,
  max_depth = 6,
  verbose = 0
)
models[["XGBoost"]] <- xgb_model
# Model evaluation helper
evaluate_model <- function(model, model_name, test_data) {
  if (model_name %in% c("Ridge Regression", "Lasso Regression")) {
    # glmnet models expect a model matrix
    x_test <- model.matrix(medv ~ ., test_data)[, -1]
    predictions <- predict(model, newx = x_test)
  } else if (model_name == "XGBoost") {
    # XGBoost models expect a plain numeric matrix
    x_test <- as.matrix(test_data[, -which(names(test_data) == "medv")])
    predictions <- predict(model, x_test)
  } else {
    # everything else uses the standard predict interface
    predictions <- predict(model, test_data)
  }
  actual <- test_data$medv
  mae <- mean(abs(predictions - actual))
  rmse <- sqrt(mean((predictions - actual)^2))
  r2 <- 1 - sum((actual - predictions)^2) / sum((actual - mean(actual))^2)
  return(data.frame(
    Model = model_name,
    MAE = mae,
    RMSE = rmse,
    R2 = r2
  ))
}
# Compare all the models
results <- data.frame()
for (model_name in names(models)) {
  model_result <- evaluate_model(models[[model_name]], model_name, test_data)
  results <- rbind(results, model_result)
}
# Sort and display the results
results <- results[order(results$RMSE), ]
print(results)
# Visual comparison
library(ggplot2)
ggplot(results, aes(x = reorder(Model, -RMSE), y = RMSE)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Model Performance Comparison (RMSE)",
       x = "Model", y = "RMSE")