📚 Today's Goals

  1. Master web scraping with the rvest package
  2. Learn to interact with APIs using the httr package
  3. Master parsing and cleaning web data
  4. Learn to scrape dynamic pages (RSelenium)
  5. Understand API authentication and rate-limit handling

🌐 Part 1: Web Scraping Fundamentals

1.1 rvest Basics

# Install the required packages
install.packages(c("rvest", "httr", "xml2", "jsonlite", "RSelenium"))
library(rvest)
library(httr)
library(xml2)

# Basic scraping workflow:
# 1. Read the page content
# 2. Parse the HTML structure
# 3. Extract the data you need
# 4. Clean and organize the data

# A simple scraping example
url <- "http://books.toscrape.com/"
webpage <- read_html(url)

# View the page title
title <- html_text(html_nodes(webpage, "title"))
cat("Page title:", title, "\n")

# Inspect the page structure
# Print the first 1000 characters of the HTML
cat(substr(as.character(webpage), 1, 1000))
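
Note: since rvest 1.0, html_nodes()/html_node() have newer equivalents, html_elements()/html_element(), and html_text2() handles whitespace more tidily; the older names still work as aliases. The same title extraction with the newer interface:

# The same extraction with the rvest 1.0+ interface
title <- webpage %>%
  html_element("title") %>%
  html_text2()
cat("Page title:", title, "\n")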

1.2 CSS Selectors and XPath

# Extract data with CSS selectors
url <- "http://books.toscrape.com/"

# Extract all book titles
# (the anchor text is truncated on this site; the full title lives in the title attribute)
book_titles <- read_html(url) %>%
  html_nodes(".product_pod h3 a") %>%
  html_attr("title")

print(head(book_titles, 10))

# Extract book prices
book_prices <- read_html(url) %>%
  html_nodes(".price_color") %>%
  html_text()

print(head(book_prices, 10))

# Extract book ratings
book_ratings <- read_html(url) %>%
  html_nodes(".star-rating") %>%
  html_attr("class") %>%
  gsub("star-rating ", "", .)

print(head(book_ratings, 10))

# Extract data with XPath
# Extract all book links
book_links <- read_html(url) %>%
  html_nodes(xpath = "//h3/a") %>%
  html_attr("href")

print(head(book_links, 10))

# Combine into one data frame
books_data <- data.frame(
  title = book_titles[1:20],
  price = book_prices[1:20],
  rating = book_ratings[1:20],
  link = book_links[1:20]
)

print(head(books_data))
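
The scraped columns are raw strings. As a small follow-up sketch (assuming the site's usual "£" price format and its "One" through "Five" rating classes), they can be converted to usable types:

# Convert raw strings into usable types
rating_map <- c(One = 1, Two = 2, Three = 3, Four = 4, Five = 5)
books_data$price_numeric <- as.numeric(gsub("[^0-9.]", "", books_data$price))
books_data$rating_numeric <- rating_map[books_data$rating]
print(head(books_data))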

1.3 Handling Paginated Sites

# Scrape multiple pages
base_url <- "http://books.toscrape.com/catalogue/page-"
all_books <- list()

# Scrape the first 5 pages
for (page in 1:5) {
  url <- paste0(base_url, page, ".html")
  cat("Scraping page", page, ":", url, "\n")
  
  tryCatch({
    webpage <- read_html(url)
    
    # Extract the data (full titles come from the title attribute)
    titles <- html_attr(html_nodes(webpage, ".product_pod h3 a"), "title")
    prices <- html_text(html_nodes(webpage, ".price_color"))
    ratings <- html_attr(html_nodes(webpage, ".star-rating"), "class")
    ratings <- gsub("star-rating ", "", ratings)
    
    # Store the data
    page_data <- data.frame(
      page = page,
      title = titles,
      price = prices,
      rating = ratings,
      stringsAsFactors = FALSE
    )
    
    all_books[[page]] <- page_data
    
    # Be polite: pause so we don't hammer the server
    Sys.sleep(1)
    
  }, error = function(e) {
    cat("Error scraping page", page, ":", e$message, "\n")
  })
}

# Combine all pages
books_df <- do.call(rbind, all_books)
cat("Scraped", nrow(books_df), "books in total\n")

# Data cleaning
# Strip the currency symbol
books_df$price_numeric <- as.numeric(gsub("£", "", books_df$price))

# Look at the price distribution
summary(books_df$price_numeric)

# Summarize by rating (group_by()/summarize() come from dplyr)
library(dplyr)

rating_summary <- books_df %>%
  group_by(rating) %>%
  summarize(
    count = n(),
    avg_price = mean(price_numeric),
    min_price = min(price_numeric),
    max_price = max(price_numeric)
  )

print(rating_summary)

🔗 Part 2: Fetching Data from APIs

2.1 httr Basics

# The httr package provides tools for talking HTTP
library(httr)

# A basic GET request
response <- GET("https://httpbin.org/get")
status_code(response)       # status code
http_status(response)       # HTTP status description
headers(response)           # response headers
content(response, "text")   # response body

# GET request with query parameters
params <- list(
  page = 1,
  limit = 10,
  sort = "desc"
)

response <- GET("https://httpbin.org/get", query = params)
cat(content(response, "text"))

# POST request
post_data <- list(
  name = "John Doe",
  email = "john@example.com",
  age = 30
)

response <- POST("https://httpbin.org/post", body = post_data, encode = "form")
cat(content(response, "text"))

# Handling a JSON response
response <- GET("https://httpbin.org/json")
json_data <- content(response, "parsed")
str(json_data, max.level = 2)
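
The jsonlite package installed earlier can also parse the raw JSON text directly, which gives more control over how arrays are simplified than content(..., "parsed"):

# Parse the raw JSON text with jsonlite
library(jsonlite)
json_text <- content(response, "text", encoding = "UTF-8")
json_parsed <- fromJSON(json_text)  # simplifies arrays into vectors/data frames
str(json_parsed, max.level = 2)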

2.2 API Authentication and Authorization

# Basic authentication
# Note: replace with real credentials in practice
username <- "user"
password <- "pass"

response <- GET(
  "https://httpbin.org/basic-auth/user/pass",
  authenticate(username, password, type = "basic")
)

status_code(response)
cat(content(response, "text"))

# API-key authentication
api_key <- "your_api_key_here"
response <- GET(
  "https://httpbin.org/bearer",
  add_headers(Authorization = paste("Bearer", api_key))
)

# OAuth 2.0 authentication
# httr ships with OAuth support
library(httr)

# Register an OAuth app (example)
app <- oauth_app(
  "github",
  key = "your_client_id",
  secret = "your_client_secret"
)

# Obtain an OAuth token
github_token <- oauth2.0_token(
  oauth_endpoints("github"),
  app,
  scope = c("public_repo", "user")
)

# Use the token to call the API
response <- GET(
  "https://api.github.com/user/repos",
  config(token = github_token)
)

# Check the response
if (status_code(response) == 200) {
  repos <- content(response)
  cat("Fetched", length(repos), "repositories\n")
} else {
  cat("Request failed with status code:", status_code(response), "\n")
}

2.3 Handling API Responses

# Error handling
safe_get <- function(url) {
  tryCatch({
    response <- GET(url)
    
    if (status_code(response) == 200) {
      return(content(response, "parsed"))
    } else {
      warning(paste("HTTP status code:", status_code(response)))
      return(NULL)
    }
  }, error = function(e) {
    warning(paste("Request failed:", e$message))
    return(NULL)
  })
}

# Using the safe wrapper
data <- safe_get("https://httpbin.org/json")
if (!is.null(data)) {
  cat("Data fetched successfully\n")
}

# Handling a paginated API
fetch_all_pages <- function(base_url, max_pages = 10) {
  all_data <- list()
  page <- 1
  
  while (page <= max_pages) {
    url <- paste0(base_url, "?page=", page)
    cat("Fetching page", page, "...\n")
    
    response <- GET(url)
    
    if (status_code(response) != 200) {
      cat("Failed to fetch page", page, "\n")
      break
    }
    
    page_data <- content(response, "parsed")
    
    # Stop when there is no more data
    if (length(page_data) == 0) {
      cat("No more data\n")
      break
    }
    
    all_data[[page]] <- page_data
    page <- page + 1
    
    # Stay under the API's rate limit
    Sys.sleep(0.5)
  }
  
  return(all_data)
}
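
A hedged usage sketch (the endpoint below is a placeholder; real APIs paginate in different ways, e.g. by cursor or offset):

# pages <- fetch_all_pages("https://api.example.com/items", max_pages = 5)
# combined <- do.call(c, pages)  # flatten the per-page lists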

# Handling rate limits
# Pass the request as a function so every retry re-executes it
# (a plain expression argument would be evaluated once and then cached)
with_retry <- function(fun, max_retries = 3, delay = 1) {
  for (attempt in 0:max_retries) {
    result <- tryCatch(fun(), error = function(e) e)
    
    if (!inherits(result, "error")) {
      return(result)
    }
    
    msg <- conditionMessage(result)
    if (grepl("429", msg) || grepl("rate limit", msg, ignore.case = TRUE)) {
      wait <- delay * (2^attempt)  # exponential backoff
      cat("Rate limit hit, retrying in", wait, "seconds...\n")
      Sys.sleep(wait)
    } else {
      stop(result)
    }
  }
  stop("Exceeded the maximum number of retries")
}
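
Usage sketch: https://httpbin.org/status/429 always answers 429, so this call backs off twice and then raises (stop_for_status() turns the 429 response into an error whose message mentions the code):

# result <- with_retry(function() {
#   response <- GET("https://httpbin.org/status/429")
#   stop_for_status(response)
#   content(response, "parsed")
# }, max_retries = 2)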

📊 Part 3: Hands-On API Case Studies

3.1 Fetching Weather Data

# Using the OpenWeatherMap API (register to get an API key)
# Note: this is an example; substitute a valid API key in practice

get_weather_data <- function(city, api_key) {
  base_url <- "http://api.openweathermap.org/data/2.5/weather"
  
  params <- list(
    q = city,
    appid = api_key,
    units = "metric",  # temperatures in Celsius
    lang = "zh_cn"     # weather descriptions in Chinese
  )
  
  response <- GET(base_url, query = params)
  
  if (status_code(response) == 200) {
    weather_data <- content(response, "parsed")
    
    # Pull out the fields we need
    result <- list(
      city = weather_data$name,
      country = weather_data$sys$country,
      temperature = weather_data$main$temp,
      feels_like = weather_data$main$feels_like,
      humidity = weather_data$main$humidity,
      pressure = weather_data$main$pressure,
      weather = weather_data$weather[[1]]$description,
      wind_speed = weather_data$wind$speed,
      sunrise = format(as.POSIXct(weather_data$sys$sunrise, 
                                  origin = "1970-01-01"), 
                       "%H:%M:%S"),
      sunset = format(as.POSIXct(weather_data$sys$sunset, 
                                 origin = "1970-01-01"), 
                      "%H:%M:%S")
    )
    
    return(result)
  } else {
    warning(paste("Failed to fetch weather data, status code:", status_code(response)))
    return(NULL)
  }
}

# Fetch the weather for several cities
get_multiple_cities_weather <- function(cities, api_key) {
  weather_list <- list()
  
  for (city in cities) {
    cat("Fetching weather for", city, "...\n")
    weather <- get_weather_data(city, api_key)
    
    if (!is.null(weather)) {
      weather_list[[city]] <- weather
    }
    
    # Stay under the API's rate limit
    Sys.sleep(1)
  }
  
  # Convert to a data frame
  weather_df <- do.call(rbind, lapply(weather_list, as.data.frame))
  return(weather_df)
}

# Example usage
# api_key <- "your_openweathermap_api_key"
# cities <- c("Beijing", "Shanghai", "Guangzhou", "Shenzhen")
# weather_data <- get_multiple_cities_weather(cities, api_key)

3.2 Fetching News Data

# Using NewsAPI to fetch news data
# Note: register for an API key first

fetch_news <- function(query = NULL, category = NULL, country = "us", 
                       page_size = 10, api_key) {
  
  base_url <- "https://newsapi.org/v2/top-headlines"
  
  params <- list(
    apiKey = api_key,
    pageSize = page_size
  )
  
  # Add optional query parameters
  if (!is.null(query)) {
    params$q <- query
  }
  
  if (!is.null(category)) {
    params$category <- category
  }
  
  if (!is.null(country)) {
    params$country <- country
  }
  
  response <- GET(base_url, query = params)
  
  if (status_code(response) == 200) {
    news_data <- content(response, "parsed")
    
    if (news_data$totalResults > 0) {
      # Extract the article fields
      articles <- lapply(news_data$articles, function(article) {
        data.frame(
          source = article$source$name,
          author = ifelse(is.null(article$author), "", article$author),
          title = article$title,
          description = ifelse(is.null(article$description), "", article$description),
          url = article$url,
          published_at = article$publishedAt,
          stringsAsFactors = FALSE
        )
      })
      
      articles_df <- do.call(rbind, articles)
      return(articles_df)
    } else {
      cat("No matching news found\n")
      return(NULL)
    }
  } else {
    warning(paste("Failed to fetch news, status code:", status_code(response)))
    return(NULL)
  }
}

# News analysis function
analyze_news <- function(news_df) {
  if (is.null(news_df)) return(NULL)
  
  cat("=== News Data Analysis ===\n")
  
  # Counts by source
  source_stats <- news_df %>%
    group_by(source) %>%
    summarize(
      article_count = n(),
      latest_article = max(published_at)
    ) %>%
    arrange(desc(article_count))
  
  cat("\n1. Distribution by source:\n")
  print(source_stats)
  
  # Keyword extraction (simple example; needs the stopwords package)
  extract_keywords <- function(text) {
    words <- unlist(strsplit(tolower(text), "\\W+"))
    words <- words[nchar(words) > 3]
    words <- words[!words %in% stopwords::stopwords("en")]
    return(words)
  }
  
  # Most frequent words in the headlines
  all_titles <- paste(news_df$title, collapse = " ")
  title_words <- extract_keywords(all_titles)
  word_freq <- sort(table(title_words), decreasing = TRUE)
  
  cat("\n2. Top headline keywords (top 10):\n")
  print(head(word_freq, 10))
  
  return(list(
    source_stats = source_stats,
    word_freq = word_freq
  ))
}

# Example usage
# api_key <- "your_newsapi_key"
# tech_news <- fetch_news(query = "technology", page_size = 20, api_key = api_key)
# if (!is.null(tech_news)) {
#   analysis <- analyze_news(tech_news)
# }

3.3 Financial Data APIs

# Using Alpha Vantage to fetch stock data
# Note: register for an API key first

get_stock_data <- function(symbol, api_key, output_size = "compact") {
  base_url <- "https://www.alphavantage.co/query"
  
  params <- list(
    `function` = "TIME_SERIES_DAILY",  # "function" is reserved in R, so it must be backticked
    symbol = symbol,
    outputsize = output_size,
    apikey = api_key
  )
  
  response <- GET(base_url, query = params)
  
  if (status_code(response) == 200) {
    stock_data <- content(response, "parsed")
    
    # Check for error messages returned by the API
    if (!is.null(stock_data$`Error Message`)) {
      warning(stock_data$`Error Message`)
      return(NULL)
    }
    
    if (!is.null(stock_data$`Note`)) {
      warning("API rate limit reached: ", stock_data$`Note`)
      return(NULL)
    }
    
    # Extract the time series
    time_series <- stock_data$`Time Series (Daily)`
    
    if (is.null(time_series)) {
      warning("No stock data found")
      return(NULL)
    }
    
    # Convert to a data frame
    dates <- names(time_series)
    stock_list <- list()
    
    for (date in dates) {
      daily_data <- time_series[[date]]
      stock_list[[date]] <- data.frame(
        date = as.Date(date),
        open = as.numeric(daily_data$`1. open`),
        high = as.numeric(daily_data$`2. high`),
        low = as.numeric(daily_data$`3. low`),
        close = as.numeric(daily_data$`4. close`),
        volume = as.numeric(daily_data$`5. volume`),
        stringsAsFactors = FALSE
      )
    }
    
    stock_df <- do.call(rbind, stock_list)
    rownames(stock_df) <- NULL
    
    # Sort by date
    stock_df <- stock_df[order(stock_df$date), ]
    
    # Compute technical indicators (needs the TTR package)
    stock_df$returns <- c(NA, diff(log(stock_df$close)))
    stock_df$sma_20 <- TTR::SMA(stock_df$close, n = 20)
    stock_df$sma_50 <- TTR::SMA(stock_df$close, n = 50)
    
    return(stock_df)
  } else {
    warning(paste("Failed to fetch stock data, status code:", status_code(response)))
    return(NULL)
  }
}

# Fetch several stocks
get_multiple_stocks <- function(symbols, api_key) {
  all_stocks <- list()
  
  for (symbol in symbols) {
    cat("Fetching data for", symbol, "...\n")
    stock_data <- get_stock_data(symbol, api_key)
    
    if (!is.null(stock_data)) {
      stock_data$symbol <- symbol
      all_stocks[[symbol]] <- stock_data
    }
    
    # Alpha Vantage enforces a strict rate limit (5 calls/minute)
    Sys.sleep(13)  # wait 13 seconds to stay under it
  }
  
  # Combine all stocks
  combined_df <- do.call(rbind, all_stocks)
  return(combined_df)
}

# Stock analysis
analyze_stocks <- function(stocks_df) {
  if (is.null(stocks_df)) return(NULL)
  
  cat("=== Stock Data Analysis ===\n")
  
  # Per-symbol summary
  stock_summary <- stocks_df %>%
    group_by(symbol) %>%
    summarize(
      start_date = min(date),
      end_date = max(date),
      days = n(),
      start_price = first(close),
      end_price = last(close),
      total_return = (end_price - start_price) / start_price * 100,
      avg_daily_volume = mean(volume, na.rm = TRUE),
      volatility = sd(returns, na.rm = TRUE) * sqrt(252) * 100,  # annualized volatility
      worst_daily_return = min(returns, na.rm = TRUE) * 100  # worst single day (not a true max drawdown)
    ) %>%
    arrange(desc(total_return))
  
  cat("\n1. Performance summary:\n")
  print(stock_summary)
  
  # Correlation analysis
  library(tidyr)
  
  # Build a returns matrix
  returns_matrix <- stocks_df %>%
    select(symbol, date, returns) %>%
    pivot_wider(names_from = symbol, values_from = returns) %>%
    select(-date) %>%
    as.matrix()
  
  # Correlation matrix of returns
  cor_matrix <- cor(returns_matrix, use = "complete.obs")
  
  cat("\n2. Return correlation matrix:\n")
  print(round(cor_matrix, 3))
  
  return(list(
    summary = stock_summary,
    correlation = cor_matrix
  ))
}

# Example usage
# api_key <- "your_alphavantage_api_key"
# symbols <- c("AAPL", "MSFT", "GOOGL")
# stocks_data <- get_multiple_stocks(symbols, api_key)
# if (!is.null(stocks_data)) {
#   analysis <- analyze_stocks(stocks_data)
# }

🖥️ Part 4: Scraping Dynamic Pages

4.1 RSelenium Basics

# RSelenium drives a real browser, so it can handle JavaScript-rendered pages
# You need Java and/or Docker first, or a standalone Selenium server

# Install RSelenium
install.packages("RSelenium")
library(RSelenium)

# Start a Selenium server (option 1: Docker)
# In a terminal: docker run -d -p 4445:4444 selenium/standalone-chrome

# Option 2: RSelenium's built-in launcher
# Check that Java is installed
# system("java -version")

# Start the Selenium server
rD <- rsDriver(
  browser = "chrome",
  chromever = "latest",  # or pin a version, e.g. "114.0.5735.90"
  port = 4445L,
  verbose = FALSE
)

# Get the client
remDr <- rD[["client"]]

# Open a page
remDr$navigate("https://www.example.com")

# Get the page title (getTitle() returns a list)
title <- remDr$getTitle()[[1]]
cat("Page title:", title, "\n")

# Get the page source
page_source <- remDr$getPageSource()[[1]]

# Close the connection
remDr$close()
rD[["server"]]$stop()

4.2 Interacting with Dynamic Pages

# Start Selenium
rD <- rsDriver(browser = "chrome", port = 4445L, verbose = FALSE)
remDr <- rD[["client"]]

# Visit an example site
remDr$navigate("https://www.r-project.org/")

# Locating elements
# By ID
try({
  search_box <- remDr$findElement(using = "id", "searchfield")
  cat("Found the search box\n")
})

# By CSS selector
try({
  links <- remDr$findElements(using = "css selector", "a")
  cat("Found", length(links), "links\n")
})

# By XPath
try({
  heading <- remDr$findElement(using = "xpath", "//h1")
  cat("Found heading:", heading$getElementText()[[1]], "\n")
})

# Interacting with elements
# Type text
try({
  search_box$sendKeysToElement(list("ggplot2", key = "enter"))
  Sys.sleep(2)  # wait for the page to load
})

# Click an element
try({
  first_result <- remDr$findElement(using = "css selector", ".gs-title a")
  first_result$clickElement()
  Sys.sleep(2)
})

# Execute JavaScript
remDr$executeScript("return document.title;")

# Scroll the page
remDr$executeScript("window.scrollTo(0, document.body.scrollHeight);")
Sys.sleep(1)
remDr$executeScript("window.scrollTo(0, 0);")

# Take a screenshot
remDr$screenshot(file = "screenshot.png")

# Close
remDr$close()
rD[["server"]]$stop()

4.3 Handling Logins and Forms

# Working with sites that require a login
# Note: this uses a placeholder site; real sites will differ

# Start Selenium
rD <- rsDriver(browser = "chrome", port = 4445L, verbose = FALSE)
remDr <- rD[["client"]]

# Open the login page
login_url <- "https://example.com/login"  # placeholder URL
remDr$navigate(login_url)
Sys.sleep(2)

# Fill in the login form
try({
  # Find the username field
  username_input <- remDr$findElement(using = "id", "username")
  username_input$sendKeysToElement(list("your_username"))
  
  # Find the password field
  password_input <- remDr$findElement(using = "id", "password")
  password_input$sendKeysToElement(list("your_password"))
  
  # Find and click the login button
  login_button <- remDr$findElement(using = "css selector", "button[type='submit']")
  login_button$clickElement()
  
  Sys.sleep(3)  # wait for the login to complete
  
  # Check whether the login succeeded
  current_url <- remDr$getCurrentUrl()[[1]]
  if (!grepl("login", current_url)) {
    cat("Login succeeded\n")
  } else {
    cat("Login may have failed\n")
  }
})

# Other form elements
# Fill in a search form
try({
  search_input <- remDr$findElement(using = "name", "q")
  search_input$clearElement()
  search_input$sendKeysToElement(list("data science", key = "enter"))
  Sys.sleep(2)
})

# Handle a dropdown menu
try({
  dropdown <- remDr$findElement(using = "id", "dropdown")
  dropdown$clickElement()
  Sys.sleep(0.5)
  
  option <- remDr$findElement(using = "xpath", "//option[@value='option2']")
  option$clickElement()
})

# Handle a checkbox
try({
  checkbox <- remDr$findElement(using = "id", "agree_terms")
  if (!checkbox$isElementSelected()[[1]]) {
    checkbox$clickElement()
  }
})

# Close
remDr$close()
rD[["server"]]$stop()

🛡️ Part 5: Advanced Techniques and Best Practices

5.1 Proxies and User-Agent Settings

# Set a user agent
user_agent <- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"

# Set the user agent with httr
response <- GET(
  "https://httpbin.org/user-agent",
  add_headers(`User-Agent` = user_agent)
)

cat(content(response, "text"))

# Using a proxy
# Configure the proxy server
proxy_url <- "http://proxy.example.com:8080"

# Option 1: httr's use_proxy() config
response <- GET(
  "https://httpbin.org/ip",
  use_proxy(url = proxy_url, username = "user", password = "pass")
)

# Option 2: set a system-wide proxy
# Sys.setenv(http_proxy = proxy_url)
# Sys.setenv(https_proxy = proxy_url)

# rvest through a proxy
session <- session(
  "https://httpbin.org/ip",
  httr::use_proxy(url = proxy_url)
)

# Rotating user agents and proxies
user_agents <- c(
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"
)

proxies <- c(
  "http://proxy1.example.com:8080",
  "http://proxy2.example.com:8080",
  "http://proxy3.example.com:8080"
)

# Rotate on each request
make_rotating_request <- function(url, user_agents, proxies) {
  # Pick a user agent and proxy at random
  ua <- sample(user_agents, 1)
  proxy <- sample(proxies, 1)
  
  cat("Using user agent:", ua, "\n")
  cat("Using proxy:", proxy, "\n")
  
  tryCatch({
    response <- GET(
      url,
      add_headers(`User-Agent` = ua),
      use_proxy(url = proxy),
      timeout(10)
    )
    
    return(content(response, "text"))
  }, error = function(e) {
    warning(paste("Request failed:", e$message))
    return(NULL)
  })
}
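
Usage sketch (the proxy hosts above are placeholders, so this only works once they point at real proxies):

# result <- make_rotating_request("https://httpbin.org/ip", user_agents, proxies)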

5.2 CAPTCHAs and Anti-Scraping Measures

# Common strategies for dealing with CAPTCHAs
# Note: automated CAPTCHA solving may violate a site's terms; use with care

# 1. Use a CAPTCHA-solving service (paid API)
solve_captcha <- function(image_url, api_key) {
  # This sketch follows 2Captcha's flow;
  # consult your provider's API docs for the real details
  
  solve_url <- "http://2captcha.com/in.php"
  result_url <- "http://2captcha.com/res.php"
  
  # Submit the CAPTCHA
  submit_params <- list(
    key = api_key,
    method = "base64",
    body = image_url,  # should be the base64-encoded image
    json = 1
  )
  
  submit_response <- POST(solve_url, body = submit_params)
  submit_data <- content(submit_response, "parsed")
  
  if (submit_data$status == 1) {
    request_id <- submit_data$request
    
    # Poll for the result
    for (i in 1:30) {  # try at most 30 times
      Sys.sleep(5)
      
      result_params <- list(
        key = api_key,
        action = "get",
        id = request_id,
        json = 1
      )
      
      result_response <- GET(result_url, query = result_params)
      result_data <- content(result_response, "parsed")
      
      if (result_data$status == 1) {
        return(result_data$request)  # the solved CAPTCHA text
      }
    }
  }
  
  return(NULL)
}

# 2. Avoid triggering anti-scraping defenses
# (assumes a remDr client from RSelenium is in scope)
avoid_anti_scraping <- function() {
  # Random delay
  delay <- runif(1, 1, 5)
  Sys.sleep(delay)
  
  # Mimic human behavior
  # Occasional random scrolling
  if (runif(1) > 0.7) {
    scroll_amount <- sample(100:500, 1)
    remDr$executeScript(paste0("window.scrollBy(0, ", scroll_amount, ");"))
    Sys.sleep(runif(1, 0.5, 2))
  }
  
  # Random mouse movement (in Selenium)
  # a real implementation needs considerably more code
}

# 3. Use a session to stay logged in
maintain_session <- function() {
  # Start a session on the login page
  s <- session("https://example.com/login")
  
  # Grab the login form, fill it in, and submit it
  # (rvest's session_submit() expects a form object, not a bare list)
  login_form <- html_form(s)[[1]]
  login_form <- html_form_set(
    login_form,
    username = "your_username",
    password = "your_password"
  )
  s <- session_submit(s, login_form)
  
  # Reuse the same session for later requests
  s <- session_jump_to(s, "https://example.com/protected_page")
  
  return(s)
}

5.3 Data Storage and Scheduling

# Store scraped data in a database
library(DBI)
library(RSQLite)

# Create the crawler database
setup_crawler_db <- function(db_path = "crawler_data.db") {
  con <- dbConnect(RSQLite::SQLite(), db_path)
  
  # Table for page data
  dbExecute(con, "
    CREATE TABLE IF NOT EXISTS webpage_data (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      url TEXT NOT NULL,
      title TEXT,
      content TEXT,
      html TEXT,
      crawl_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
      UNIQUE(url, crawl_time)
    )
  ")
  
  # Table for API data
  dbExecute(con, "
    CREATE TABLE IF NOT EXISTS api_data (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      api_endpoint TEXT NOT NULL,
      parameters TEXT,
      response_data TEXT,
      call_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    )
  ")
  
  # Log table
  dbExecute(con, "
    CREATE TABLE IF NOT EXISTS crawl_log (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      task_name TEXT,
      url TEXT,
      status TEXT,
      message TEXT,
      duration REAL,
      log_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    )
  ")
  
  dbDisconnect(con)
}

# Store page data
store_webpage_data <- function(url, title, content, html, db_path = "crawler_data.db") {
  con <- dbConnect(RSQLite::SQLite(), db_path)
  on.exit(dbDisconnect(con))
  
  # A parameterized query guards against SQL injection
  dbExecute(con, "
    INSERT INTO webpage_data (url, title, content, html) 
    VALUES (?, ?, ?, ?)",
    params = list(url, title, content, html)
  )
}

# Write a crawl-log entry
log_crawl <- function(task_name, url, status, message, duration, 
                      db_path = "crawler_data.db") {
  con <- dbConnect(RSQLite::SQLite(), db_path)
  on.exit(dbDisconnect(con))
  
  # Parameterized here too, rather than pasting values into the SQL
  dbExecute(con, "
    INSERT INTO crawl_log (task_name, url, status, message, duration) 
    VALUES (?, ?, ?, ?, ?)",
    params = list(task_name, url, status, message, duration)
  )
}

# Scheduled crawl jobs
schedule_crawler <- function() {
  # Task scheduling via the cronR package
  # install.packages("cronR")  # one-time install
  library(cronR)
  
  # Write the crawler script
  crawler_script <- "
  library(rvest)
  library(httr)
  source('crawler_functions.R')
  
  # Run the crawl
  result <- crawl_news_sites()
  
  # Save the results
  saveRDS(result, paste0('crawl_results_', Sys.Date(), '.rds'))
  "
  
  writeLines(crawler_script, "daily_crawler.R")
  
  # Create the cron job (runs daily at 2 a.m.)
  cmd <- cron_rscript("daily_crawler.R")
  cron_add(command = cmd, frequency = 'daily', at = '02:00', 
           id = 'daily_news_crawl', description = 'Daily news crawl')
  
  # List all jobs
  cron_ls()
}

🏭 Part 6: End-to-End Case Studies

Case 1: Real-Estate Price Monitoring System

# Real-estate price monitoring crawler
library(rvest)
library(httr)
library(dplyr)
library(DBI)
library(RSQLite)

# Set up the database
setup_realestate_db <- function() {
  con <- dbConnect(RSQLite::SQLite(), "realestate.db")
  
  # Listings table
  dbExecute(con, "
    CREATE TABLE IF NOT EXISTS properties (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      source TEXT NOT NULL,
      property_id TEXT UNIQUE,
      title TEXT,
      price REAL,
      location TEXT,
      area REAL,
      bedrooms INTEGER,
      bathrooms INTEGER,
      property_type TEXT,
      listing_date DATE,
      url TEXT UNIQUE,
      crawl_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
      last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    )
  ")
  
  # Price-history table
  dbExecute(con, "
    CREATE TABLE IF NOT EXISTS price_history (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      property_id TEXT,
      price REAL,
      record_date DATE,
      FOREIGN KEY (property_id) REFERENCES properties(property_id)
    )
  ")
  
  dbDisconnect(con)
}

# Lianjia crawler
crawl_lianjia <- function(city = "bj", district = NULL, max_pages = 3) {
  base_url <- paste0("https://", city, ".lianjia.com/ershoufang/")
  
  if (!is.null(district)) {
    base_url <- paste0(base_url, district, "/")
  }
  
  all_properties <- list()
  
  for (page in 1:max_pages) {
    url <- paste0(base_url, "pg", page)
    cat("Scraping Lianjia page", page, ":", url, "\n")
    
    tryCatch({
      # Set request headers
      headers <- add_headers(
        `User-Agent` = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        `Accept` = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
      )
      
      response <- GET(url, headers)
      
      if (status_code(response) != 200) {
        cat("Request failed with status code:", status_code(response), "\n")
        break
      }
      
      webpage <- read_html(content(response, "text"))
      
      # Extract the listing nodes
      listings <- html_nodes(webpage, ".sellListContent li")
      
      for (listing in listings) {
        tryCatch({
          # Extract listing fields
          title <- html_text(html_node(listing, ".title a"))
          price <- html_text(html_node(listing, ".totalPrice span"))
          price <- as.numeric(gsub("[^0-9.]", "", price))  # price in units of 10,000 CNY
          
          # Extract the house-info string
          house_info <- html_text(html_node(listing, ".houseInfo"))
          house_info_parts <- strsplit(house_info, "\\|")[[1]]
          
          # Parse the house-info fields
          location <- trimws(house_info_parts[1])
          layout <- trimws(house_info_parts[2])
          area <- as.numeric(gsub("[^0-9.]", "", house_info_parts[3]))
          
          # Parse the layout string (e.g. "2室1衞": rooms before 室,
          # then the first number after it; site markup may vary)
          bedrooms <- as.numeric(gsub("室.*", "", layout))
          bathrooms <- as.numeric(gsub("[^0-9].*", "", gsub(".*室", "", layout)))
          
          # Extract the link
          link <- html_attr(html_node(listing, ".title a"), "href")
          property_id <- gsub(".*/|.html", "", link)
          
          # Store the record
          property_data <- data.frame(
            source = "lianjia",
            property_id = property_id,
            title = title,
            price = price,
            location = location,
            area = area,
            bedrooms = bedrooms,
            bathrooms = bathrooms,
            property_type = "二手房",  # resale housing
            url = link,
            stringsAsFactors = FALSE
          )
          
          all_properties[[length(all_properties) + 1]] <- property_data
        }, error = function(e) {
          # Skip listings that fail to parse
          # (note: try() has no error argument, so tryCatch() is used here)
        })
      }
      
      # Pause to avoid getting blocked
      Sys.sleep(runif(1, 1, 3))
      
    }, error = function(e) {
      cat("Error scraping page", page, ":", e$message, "\n")
    })
  }
  
  # Combine everything
  if (length(all_properties) > 0) {
    properties_df <- do.call(rbind, all_properties)
    return(properties_df)
  } else {
    return(NULL)
  }
}

# Store listings in the database
store_properties <- function(properties_df, db_path = "realestate.db") {
  con <- dbConnect(RSQLite::SQLite(), db_path)
  on.exit(dbDisconnect(con))
  
  # Which property_ids already exist?
  existing_ids <- dbGetQuery(con, "SELECT property_id FROM properties")$property_id
  
  new_properties <- properties_df[!properties_df$property_id %in% existing_ids, ]
  existing_properties <- properties_df[properties_df$property_id %in% existing_ids, ]
  
  # Insert new records
  if (nrow(new_properties) > 0) {
    dbWriteTable(con, "properties", new_properties, append = TRUE, row.names = FALSE)
    cat("Inserted", nrow(new_properties), "new records\n")
  }
  
  # Update prices on existing records (parameterized queries throughout)
  if (nrow(existing_properties) > 0) {
    for (i in 1:nrow(existing_properties)) {
      prop <- existing_properties[i, ]
      
      # Has the price changed?
      old_price <- dbGetQuery(con,
        "SELECT price FROM properties WHERE property_id = ?",
        params = list(prop$property_id)
      )$price
      
      if (!is.na(old_price) && old_price != prop$price) {
        # Update the price
        dbExecute(con, "
          UPDATE properties 
          SET price = ?, last_updated = CURRENT_TIMESTAMP 
          WHERE property_id = ?",
          params = list(prop$price, prop$property_id)
        )
        
        # Record the price history
        dbExecute(con, "
          INSERT INTO price_history (property_id, price, record_date) 
          VALUES (?, ?, DATE('now'))",
          params = list(prop$property_id, prop$price)
        )
        
        cat("Updated price for listing", prop$property_id, ":", old_price, "->", prop$price, "\n")
      }
    }
  }
}

# Analysis functions
analyze_realestate <- function(db_path = "realestate.db") {
  con <- dbConnect(RSQLite::SQLite(), db_path)
  on.exit(dbDisconnect(con))
  
  cat("=== Real-Estate Data Analysis ===\n\n")
  
  # 1. Basic statistics
  basic_stats <- dbGetQuery(con, "
    SELECT 
      COUNT(*) as total_properties,
      COUNT(DISTINCT location) as distinct_locations,
      AVG(price) as avg_price,
      MIN(price) as min_price,
      MAX(price) as max_price,
      AVG(area) as avg_area,
      AVG(price/area) as avg_price_per_sqm
    FROM properties
  ")
  
  cat("1. Basic statistics:\n")
  print(basic_stats)
  
  # 2. Price distribution (prices are in units of 10,000 CNY)
  price_distribution <- dbGetQuery(con, "
    SELECT 
      CASE 
        WHEN price < 100 THEN 'Below 1M'
        WHEN price < 300 THEN '1M-3M'
        WHEN price < 500 THEN '3M-5M'
        WHEN price < 1000 THEN '5M-10M'
        ELSE 'Over 10M'
      END as price_range,
      COUNT(*) as property_count,
      AVG(price) as avg_price_in_range,
      AVG(area) as avg_area_in_range
    FROM properties
    GROUP BY price_range
    ORDER BY MIN(price)
  ")
  
  cat("\n2. Price distribution:\n")
  print(price_distribution)
  
  # 3. Location analysis
  area_analysis <- dbGetQuery(con, "
    SELECT 
      location,
      COUNT(*) as property_count,
      AVG(price) as avg_price,
      AVG(area) as avg_area,
      AVG(price/area) as avg_price_per_sqm
    FROM properties
    GROUP BY location
    HAVING COUNT(*) >= 5
    ORDER BY avg_price_per_sqm DESC
    LIMIT 10
  ")
  
  cat("\n3. Most expensive locations (top 10 by unit price):\n")
  print(area_analysis)
  
  # 4. Layout analysis
  layout_analysis <- dbGetQuery(con, "
    SELECT 
      bedrooms,
      bathrooms,
      COUNT(*) as property_count,
      AVG(price) as avg_price,
      AVG(area) as avg_area
    FROM properties
    WHERE bedrooms IS NOT NULL AND bathrooms IS NOT NULL
    GROUP BY bedrooms, bathrooms
    ORDER BY bedrooms, bathrooms
  ")
  
  cat("\n4. Layout analysis:\n")
  print(layout_analysis)
  
  return(list(
    basic_stats = basic_stats,
    price_distribution = price_distribution,
    area_analysis = area_analysis,
    layout_analysis = layout_analysis
  ))
}

# Main driver
main_realestate_crawler <- function() {
  # Initialize the database
  setup_realestate_db()
  
  # Scrape the data
  cat("Starting the Lianjia crawl...\n")
  properties <- crawl_lianjia(city = "bj", max_pages = 2)
  
  if (!is.null(properties)) {
    cat("Scraped", nrow(properties), "listings\n")
    
    # Store the data
    store_properties(properties)
    
    # Analyze the data
    analysis <- analyze_realestate()
    
    # Generate the report
    generate_report(analysis)
  } else {
    cat("Crawl failed\n")
  }
}

# Generate an HTML report
generate_report <- function(analysis, output_file = "realestate_report.html") {
  library(knitr)
  library(rmarkdown)
  
  # Single-quoted so the inline `r ...` chunks survive;
  # envir = environment() below lets the template see `analysis`
  report_template <- '
# Real-Estate Price Monitoring Report
Generated: `r Sys.time()`

## 1. Data Overview
- Total listings: `r analysis$basic_stats$total_properties`
- Average price: `r round(analysis$basic_stats$avg_price, 2)` (10k CNY)
- Average area: `r round(analysis$basic_stats$avg_area, 2)` sqm
- Average unit price: `r round(analysis$basic_stats$avg_price_per_sqm, 2)` (10k CNY/sqm)

## 2. Price Distribution
```{r, echo=FALSE}
knitr::kable(analysis$price_distribution)
```

## 3. Location Analysis
```{r, echo=FALSE}
knitr::kable(analysis$area_analysis)
```

## 4. Layout Analysis
```{r, echo=FALSE}
knitr::kable(analysis$layout_analysis)
```
'
  
  writeLines(report_template, "report_template.Rmd")
  rmarkdown::render("report_template.Rmd", output_file = output_file,
                    envir = environment())
  
  cat("Report generated:", output_file, "\n")
}

# Run the main function
main_realestate_crawler()

Case 2: Social Media Monitoring System
# Social media monitoring system
library(httr)
library(jsonlite)
library(dplyr)
library(lubridate)

# Configure several social media APIs
# Note: the code below needs the corresponding API keys

# Twitter API v2 monitoring
monitor_twitter <- function(query, api_key, max_results = 100) {
  base_url <- "https://api.twitter.com/2/tweets/search/recent"
  
  headers <- add_headers(
    `Authorization` = paste("Bearer", api_key)
  )
  
  params <- list(
    query = query,
    max_results = max_results,
    "tweet.fields" = "created_at,public_metrics,entities",
    "user.fields" = "username,name"
  )
  
  response <- GET(base_url, headers, query = params)
  
  if (status_code(response) == 200) {
    tweets_data <- content(response, "parsed")
    
    if (!is.null(tweets_data$data)) {
      # Process the tweets
      tweets_list <- lapply(tweets_data$data, function(tweet) {
        data.frame(
          platform = "Twitter",
          tweet_id = tweet$id,
          text = tweet$text,
          created_at = tweet$created_at,
          retweet_count = tweet$public_metrics$retweet_count,
          reply_count = tweet$public_metrics$reply_count,
          like_count = tweet$public_metrics$like_count,
          quote_count = tweet$public_metrics$quote_count,
          stringsAsFactors = FALSE
        )
      })
      
      tweets_df <- do.call(rbind, tweets_list)
      return(tweets_df)
    }
  }
  
  return(NULL)
}

# Reddit API monitoring
monitor_reddit <- function(subreddit, query = NULL, limit = 100) {
  base_url <- paste0("https://www.reddit.com/r/", subreddit, "/new.json")
  
  params <- list(limit = limit)
  
  if (!is.null(query)) {
    base_url <- paste0("https://www.reddit.com/r/", subreddit, "/search.json")
    params$q <- query
    params$restrict_sr <- "on"
    params$sort <- "new"
  }
  
  headers <- add_headers(
    `User-Agent` = "SocialMediaMonitor/1.0"
  )
  
  response <- GET(base_url, headers, query = params)
  
  if (status_code(response) == 200) {
    reddit_data <- content(response, "parsed")
    
    posts_list <- lapply(reddit_data$data$children, function(post) {
      post_data <- post$data
      
      data.frame(
        platform = "Reddit",
        post_id = post_data$id,
        title = post_data$title,
        text = post_data$selftext,
        subreddit = post_data$subreddit,
        author = post_data$author,
        created_utc = as.POSIXct(post_data$created_utc, origin = "1970-01-01"),
        score = post_data$score,
        num_comments = post_data$num_comments,
        upvote_ratio = post_data$upvote_ratio,
        url = post_data$url,
        stringsAsFactors = FALSE
      )
    })
    
    posts_df <- do.call(rbind, posts_list)
    return(posts_df)
  }
  
  return(NULL)
}

# Sentiment analysis function
analyze_sentiment <- function(texts) {
  # Simple lexicon-based sentiment analysis
  # (toy Chinese lexicons, kept as-is to match Chinese input text;
  #  real applications should use a proper NLP model)
  
  positive_words <- c("好", "優秀", "棒", "贊", "喜歡", "愛", "高興", "開心",
                     "完美", "精彩", "推薦", "支持", "感謝", "棒極了")
  
  negative_words <- c("差", "糟糕", "爛", "討厭", "恨", "生氣", "傷心",
                     "失望", "垃圾", "問題", "投訴", "不好", "不行")
  
  sentiments <- sapply(texts, function(text) {
    text_lower <- tolower(text)
    
    pos_count <- sum(sapply(positive_words, function(word) {
      grepl(word, text_lower)
    }))
    
    neg_count <- sum(sapply(negative_words, function(word) {
      grepl(word, text_lower)
    }))
    
    if (pos_count > neg_count) {
      return("positive")
    } else if (neg_count > pos_count) {
      return("negative")
    } else {
      return("neutral")
    }
  })
  
  return(sentiments)
}
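
A quick usage check with toy inputs (the lexicon matches Chinese text, so the examples are Chinese):

# analyze_sentiment(c("這個產品很好,推薦", "太糟糕了,很失望", "還可以"))
# expected labels: "positive" "negative" "neutral"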

# Trend analysis
analyze_trends <- function(social_data, time_window = "hour") {
  if (is.null(social_data) || nrow(social_data) == 0) {
    return(NULL)
  }
  
  # Make sure there is a timestamp column
  if (!"created_at" %in% names(social_data)) {
    if ("created_utc" %in% names(social_data)) {
      social_data$created_at <- social_data$created_utc
    } else {
      social_data$created_at <- Sys.time()
    }
  }
  
  # Bucket by time window
  social_data$time_group <- floor_date(social_data$created_at, time_window)
  
  trends <- social_data %>%
    group_by(platform, time_group) %>%
    summarize(
      post_count = n(),
      avg_score = mean(score, na.rm = TRUE),
      total_comments = sum(num_comments, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    arrange(time_group)
  
  return(trends)
}

# Keyword extraction
extract_keywords <- function(texts, n = 10) {
  all_text <- paste(texts, collapse = " ")
  
  # Simple frequency-based keyword extraction
  words <- unlist(strsplit(all_text, "\\W+"))
  words <- words[nchar(words) > 1]
  words <- tolower(words)
  
  # Remove stopwords (toy Chinese stopword list, kept for Chinese input)
  stopwords_ch <- c("的", "了", "在", "是", "我", "有", "和", "就", 
                    "不", "人", "都", "一", "一個", "上", "也", "很", 
                    "到", "説", "要", "去", "你", "會", "着", "沒有",
                    "看", "好", "自己", "這")
  
  words <- words[!words %in% stopwords_ch]
  
  word_freq <- sort(table(words), decreasing = TRUE)
  
  return(head(word_freq, n))
}

# Main monitoring function
social_media_monitor <- function(queries, duration_hours = 24) {
  all_data <- list()
  
  # Configure API keys (read from environment variables or a config file in practice)
  # twitter_api_key <- Sys.getenv("TWITTER_API_KEY")
  # reddit_client_id <- Sys.getenv("REDDIT_CLIENT_ID")
  # reddit_client_secret <- Sys.getenv("REDDIT_CLIENT_SECRET")
  
  for (query in queries) {
    cat("Monitoring keyword:", query, "\n")
    
    # Monitor Twitter
    cat("  Fetching Twitter data...\n")
    twitter_data <- monitor_twitter(query, api_key = "dummy_key", max_results = 50)
    
    if (!is.null(twitter_data)) {
      twitter_data$query <- query
      twitter_data$sentiment <- analyze_sentiment(twitter_data$text)
      all_data[["twitter"]] <- rbind(all_data[["twitter"]], twitter_data)
    }
    
    # Monitor Reddit
    cat("  Fetching Reddit data...\n")
    reddit_data <- monitor_reddit("all", query = query, limit = 50)
    
    if (!is.null(reddit_data)) {
      reddit_data$query <- query
      reddit_data$sentiment <- analyze_sentiment(paste(reddit_data$title, reddit_data$text))
      all_data[["reddit"]] <- rbind(all_data[["reddit"]], reddit_data)
    }
    
    # Pause to respect API limits
    Sys.sleep(2)
  }
  
  # Combine all platforms
  # (the platforms have different columns, so bind_rows() fills the gaps
  #  with NA; a plain rbind() would fail here)
  combined_data <- bind_rows(all_data)
  
  if (!is.null(combined_data) && nrow(combined_data) > 0) {
    # Trend analysis
    trends <- analyze_trends(combined_data, "hour")
    
    # Hot keywords
    all_texts <- c(
      if (!is.null(all_data[["twitter"]])) all_data[["twitter"]]$text,
      if (!is.null(all_data[["reddit"]])) paste(all_data[["reddit"]]$title, all_data[["reddit"]]$text)
    )
    
    keywords <- extract_keywords(all_texts, 20)
    
    # Sentiment distribution
    sentiment_dist <- if (!is.null(combined_data$sentiment)) {
      table(combined_data$sentiment)
    } else {
      NULL
    }
    
    return(list(
      raw_data = combined_data,
      trends = trends,
      keywords = keywords,
      sentiment = sentiment_dist,
      summary = list(
        total_posts = nrow(combined_data),
        platforms = unique(combined_data$platform),
        time_range = range(combined_data$created_at, na.rm = TRUE)
      )
    ))
  }
  
  return(NULL)
}

# Generate the monitoring report
generate_social_report <- function(monitor_results, output_file = "social_media_report.html") {
  library(ggplot2)
  library(plotly)
  
  if (is.null(monitor_results)) {
    cat("No data to report on\n")
    return(NULL)
  }
  
  # Build the report
  report <- list()
  
  # 1. Summary
  report$summary <- monitor_results$summary
  
  # 2. Trend chart
  if (!is.null(monitor_results$trends)) {
    trend_plot <- ggplot(monitor_results$trends, 
                         aes(x = time_group, y = post_count, color = platform)) +
      geom_line() +
      geom_point() +
      labs(title = "Social Media Trends", x = "Time", y = "Post count") +
      theme_minimal()
    
    report$trend_plot <- trend_plot
  }
  
  # 3. Keyword-cloud data
  if (!is.null(monitor_results$keywords)) {
    keyword_df <- data.frame(
      word = names(monitor_results$keywords),
      freq = as.numeric(monitor_results$keywords)
    )
    
    report$keywords <- keyword_df
  }
  
  # 4. Sentiment analysis
  if (!is.null(monitor_results$sentiment)) {
    sentiment_df <- data.frame(
      sentiment = names(monitor_results$sentiment),
      count = as.numeric(monitor_results$sentiment)
    )
    
    sentiment_plot <- ggplot(sentiment_df, aes(x = sentiment, y = count, fill = sentiment)) +
      geom_bar(stat = "identity") +
      labs(title = "Sentiment Analysis", x = "Sentiment", y = "Count") +
      theme_minimal()
    
    report$sentiment_plot <- sentiment_plot
  }
  
  # 5. Top posts
  if (!is.null(monitor_results$raw_data)) {
    top_posts <- monitor_results$raw_data %>%
      arrange(desc(score)) %>%
      head(10) %>%
      select(platform, text, score, sentiment, created_at)
    
    report$top_posts <- top_posts
  }
  
  # Save the report object
  saveRDS(report, gsub(".html", ".rds", output_file))
  
  # Generate the HTML report
  html_report <- "
  <!DOCTYPE html>
  <html>
  <head>
    <title>Social Media Monitoring Report</title>
    <style>
      body { font-family: Arial, sans-serif; margin: 40px; }
      .section { margin-bottom: 40px; border-bottom: 1px solid #ddd; padding-bottom: 20px; }
      h1, h2 { color: #333; }
      table { border-collapse: collapse; width: 100%; }
      th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
      th { background-color: #f2f2f2; }
    </style>
  </head>
  <body>
    <h1>Social Media Monitoring Report</h1>
    <p>Generated: %s</p>
    
    <div class='section'>
      <h2>1. Data Summary</h2>
      <p>Total posts: %d</p>
      <p>Platforms monitored: %s</p>
      <p>Time range: %s to %s</p>
    </div>
  </body>
  </html>
  "
  
  html_content <- sprintf(html_report,
                         Sys.time(),
                         report$summary$total_posts,
                         paste(report$summary$platforms, collapse = ", "),
                         format(report$summary$time_range[1], "%Y-%m-%d %H:%M:%S"),
                         format(report$summary$time_range[2], "%Y-%m-%d %H:%M:%S"))
  
  writeLines(html_content, output_file)
  cat("Report generated:", output_file, "\n")
  
  return(report)
}

# Example usage
# queries <- c("數據科學", "機器學習", "人工智能")  # Chinese keywords to match the lexicons
# results <- social_media_monitor(queries, duration_hours = 1)
# if (!is.null(results)) {
#   report <- generate_social_report(results)
# }

💻 Today's Exercises

Exercise 1: A Simple Web Scraper

# 1. Pick any news site (e.g. Xinhua, People's Daily)
# 2. Use rvest to scrape the headlines and links on the front page
# 3. Extract each article's publication time and summary
# 4. Save the data to a CSV file
# 5. Add error handling and delays to avoid getting blocked

Exercise 2: Fetching Data from an API

# 1. Register for a free OpenWeatherMap API account
# 2. Write a function that fetches the weather for a given city
# 3. Fetch the weather for 5 major cities
# 4. Analyze and compare conditions across those cities
# 5. Visualize the results (temperature, humidity, wind speed, etc.)

Exercise 3: Scraping Dynamic Content

# 1. Use RSelenium to visit a JavaScript-rendered site
# 2. Simulate user behavior: searching, paging, clicking, etc.
# 3. Scrape at least 3 pages of dynamic content
# 4. Store the scraped data in a database
# 5. Produce a simple analysis and report

📌 Today's Summary

Key takeaways:

  • ✓ Scraping static pages with the rvest package
  • ✓ Interacting with APIs via the httr package
  • ✓ Parsing web data (CSS selectors, XPath)
  • ✓ Handling dynamic pages (RSelenium)
  • ✓ API authentication and error handling

Web-scraping ground rules:

  1. Respect robots.txt: honor each site's crawling policy (see the sketch after this list)
  2. Use sensible delays: don't overload the server
  3. Handle failures: network requests can fail, so build in error handling
  4. Respect copyright: don't misuse scraped data
  5. Mind the legal risk: some data may be legally protected
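
A minimal sketch of point 1, assuming the robotstxt package is installed:

# Check robots.txt before crawling
library(robotstxt)
paths_allowed(paths = "/catalogue/", domain = "books.toscrape.com")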

API usage essentials:

  1. Read the docs: study the API documentation for limits and usage
  2. Manage your keys: store API keys securely, never hard-code them (see the sketch after this list)
  3. Respect rate limits: know and obey the API's quotas
  4. Handle errors: deal with the full range of HTTP status codes
  5. Cache responses: cache API results to cut down on requests (also sketched below)
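
Minimal sketches for points 2 and 5, assuming a key stored in ~/.Renviron (e.g. a line OPENWEATHER_API_KEY=...) and the memoise package for caching:

# Read the key from the environment instead of hard-coding it
api_key <- Sys.getenv("OPENWEATHER_API_KEY")

# Cache GET responses in memory with memoise
library(memoise)
cached_get <- memoise(function(url) content(GET(url), "parsed"))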

🎯 Tomorrow's Preview

Day 13: Text Mining and Natural Language Processing

  • Text preprocessing techniques
  • Word frequencies and TF-IDF
  • Sentiment analysis
  • Topic modeling (LDA)
  • Text classification

Study tips

  1. Start practicing scraping on simple static sites
  2. Practice with free APIs such as OpenWeatherMap and NewsAPI
  3. Mind the ethics and legality of web scraping
  4. Learn to read page source and understand HTML structure
  5. In real projects, prefer existing data sources or APIs over reinventing the wheel

Remember to save today's script as day12.R; tomorrow we enter the world of text mining! 📝