📚 今日目標
- 掌握使用rvest包進行網頁爬蟲
- 學習使用httr包與API交互
- 掌握網頁數據解析和清洗
- 學習處理動態網頁(RSelenium)
- 瞭解API認證和限制處理
🌐 第一部分:網頁爬蟲基礎
1.1 rvest包基礎
# 安裝必要的包
install.packages(c("rvest", "httr", "xml2", "jsonlite", "RSelenium"))
library(rvest)
library(httr)
library(xml2)
# 基本網頁抓取流程
# 1. 讀取網頁內容
# 2. 解析HTML結構
# 3. 提取所需數據
# 4. 清洗和整理數據
# 簡單的網頁抓取示例
url <- "http://books.toscrape.com/"
webpage <- read_html(url)
# 查看網頁標題
title <- html_text(html_nodes(webpage, "title"))
cat("網頁標題:", title, "\n")
# 查看網頁結構
# 打印前1000個字符的HTML
cat(substr(as.character(webpage), 1, 1000))
1.2 CSS選擇器與XPath
# 使用CSS選擇器提取數據
url <- "http://books.toscrape.com/"
# 提取所有圖書標題(鏈接文字可能被截斷,完整書名通常放在title屬性中)
book_titles <- read_html(url) %>%
html_nodes(".product_pod h3 a") %>%
html_attr("title")
print(head(book_titles, 10))
# 提取圖書價格
book_prices <- read_html(url) %>%
html_nodes(".price_color") %>%
html_text()
print(head(book_prices, 10))
# 提取圖書評分
book_ratings <- read_html(url) %>%
html_nodes(".star-rating") %>%
html_attr("class") %>%
gsub("star-rating ", "", .)
print(head(book_ratings, 10))
# 使用XPath提取數據
# 提取所有鏈接
book_links <- read_html(url) %>%
html_nodes(xpath = "//h3/a") %>%
html_attr("href")
print(head(book_links, 10))
# 組合數據
books_data <- data.frame(
title = book_titles[1:20],
price = book_prices[1:20],
rating = book_ratings[1:20],
link = book_links[1:20]
)
print(head(books_data))
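補充說明:上面提取的href是相對路徑。下面是一個小示意(沿用前面已有的 url、book_links 與 books_data),用xml2的url_absolute把相對鏈接轉為絕對鏈接,並順手把價格轉成數值,僅供參考。
# 將相對鏈接轉為絕對鏈接
full_links <- xml2::url_absolute(book_links, base = url)
print(head(full_links, 3))
# 價格轉為數值,方便後續計算
books_data$price_numeric <- as.numeric(gsub("[^0-9.]", "", books_data$price))
str(books_data)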
1.3 處理分頁網站
# 抓取多頁數據
base_url <- "http://books.toscrape.com/catalogue/page-"
all_books <- list()
# 抓取前5頁
for (page in 1:5) {
url <- paste0(base_url, page, ".html")
cat("正在抓取第", page, "頁:", url, "\n")
tryCatch({
webpage <- read_html(url)
# 提取數據
titles <- html_text(html_nodes(webpage, ".product_pod h3 a"))
prices <- html_text(html_nodes(webpage, ".price_color"))
ratings <- html_attr(html_nodes(webpage, ".star-rating"), "class")
ratings <- gsub("star-rating ", "", ratings)
# 存儲數據
page_data <- data.frame(
page = page,
title = titles,
price = prices,
rating = ratings,
stringsAsFactors = FALSE
)
all_books[[page]] <- page_data
# 禮貌性延遲,避免給服務器造成壓力
Sys.sleep(1)
}, error = function(e) {
cat("抓取第", page, "頁時出錯:", e$message, "\n")
})
}
# 合併所有數據
books_df <- do.call(rbind, all_books)
cat("總共抓取了", nrow(books_df), "本書籍\n")
# 數據清洗
# 移除價格符號(只保留數字與小數點,可避免編碼差異導致NA)
books_df$price_numeric <- as.numeric(gsub("[^0-9.]", "", books_df$price))
# 查看價格分佈
summary(books_df$price_numeric)
# 按評分分組統計(需要dplyr)
library(dplyr)
rating_summary <- books_df %>%
group_by(rating) %>%
summarize(
count = n(),
avg_price = mean(price_numeric),
min_price = min(price_numeric),
max_price = max(price_numeric)
)
print(rating_summary)
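在此基礎上,也可以把單頁抓取封裝成函數再批量調用,下面是一個簡單示意(沿用上面的選擇器,結果另存為CSV)。
# 把單頁抓取封裝成函數,便於重用
scrape_books_page <- function(page) {
url <- paste0("http://books.toscrape.com/catalogue/page-", page, ".html")
webpage <- read_html(url)
data.frame(
page = page,
title = html_text(html_nodes(webpage, ".product_pod h3 a")),
price = html_text(html_nodes(webpage, ".price_color")),
rating = gsub("star-rating ", "", html_attr(html_nodes(webpage, ".star-rating"), "class")),
stringsAsFactors = FALSE
)
}
# 抓取前3頁並保存為CSV
books_sample <- do.call(rbind, lapply(1:3, function(p) {
Sys.sleep(1) # 禮貌性延遲
scrape_books_page(p)
}))
write.csv(books_sample, "books_sample.csv", row.names = FALSE)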
🔗 第二部分:API數據獲取
2.1 httr包基礎
# httr包提供了與HTTP交互的功能
library(httr)
# 基本GET請求
response <- GET("https://httpbin.org/get")
status_code(response) # 狀態碼
http_status(response) # HTTP狀態
headers(response) # 響應頭
content(response, "text") # 響應內容
# 帶參數的GET請求
params <- list(
page = 1,
limit = 10,
sort = "desc"
)
response <- GET("https://httpbin.org/get", query = params)
cat(content(response, "text"))
# POST請求
post_data <- list(
name = "John Doe",
email = "john@example.com",
age = 30
)
response <- POST("https://httpbin.org/post", body = post_data, encode = "form")
cat(content(response, "text"))
# 處理JSON響應
response <- GET("https://httpbin.org/json")
json_data <- content(response, "parsed")
str(json_data, max.level = 2)
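httr還內置了RETRY()用於自動重試,配合timeout()可以讓請求更健壯;下面是一個簡單示例。
# 帶超時與自動重試的請求
response <- RETRY(
"GET",
"https://httpbin.org/status/200",
times = 3, # 最多嘗試3次
pause_base = 1, # 重試間隔基準秒數(指數退避)
timeout(5) # 單次請求超時5秒
)
stop_for_status(response) # 非2xx狀態時拋出錯誤
status_code(response)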
2.2 API認證與授權
# 基本認證
# 注意:實際使用時替換為真實的用户名和密碼
username <- "user"
password <- "pass"
response <- GET(
"https://httpbin.org/basic-auth/user/pass",
authenticate(username, password, type = "basic")
)
status_code(response)
cat(content(response, "text"))
# API密鑰認證
api_key <- "your_api_key_here"
response <- GET(
"https://httpbin.org/bearer",
add_headers(Authorization = paste("Bearer", api_key))
)
# OAuth 2.0認證
# 安裝並加載httr的OAuth支持
library(httr)
# 設置OAuth應用(示例)
app <- oauth_app(
"github",
key = "your_client_id",
secret = "your_client_secret"
)
# 獲取OAuth令牌
github_token <- oauth2.0_token(
oauth_endpoints("github"),
app,
scope = c("public_repo", "user")
)
# 使用令牌訪問API
response <- GET(
"https://api.github.com/user/repos",
config(token = github_token)
)
# 檢查響應
if (status_code(response) == 200) {
repos <- content(response)
cat("成功獲取", length(repos), "個倉庫\n")
} else {
cat("請求失敗,狀態碼:", status_code(response), "\n")
}
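密鑰管理的一個常見做法是放進環境變量(如~/.Renviron),再用Sys.getenv讀取;下面是一個示意,其中GITHUB_PAT只是示例變量名。
# 從環境變量讀取API密鑰,避免硬編碼
# 可在 ~/.Renviron 中加入一行:GITHUB_PAT=xxxxxxxx,然後重啓R會話
api_key <- Sys.getenv("GITHUB_PAT")
if (identical(api_key, "")) {
cat("未設置環境變量 GITHUB_PAT,跳過示例\n")
} else {
response <- GET(
"https://api.github.com/user",
add_headers(Authorization = paste("token", api_key))
)
print(status_code(response))
}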
2.3 處理API響應
# 錯誤處理
safe_get <- function(url) {
tryCatch({
response <- GET(url)
if (status_code(response) == 200) {
return(content(response, "parsed"))
} else {
warning(paste("HTTP狀態碼:", status_code(response)))
return(NULL)
}
}, error = function(e) {
warning(paste("請求出錯:", e$message))
return(NULL)
})
}
# 使用安全函數
data <- safe_get("https://httpbin.org/json")
if (!is.null(data)) {
cat("成功獲取數據\n")
}
# 處理分頁API
fetch_all_pages <- function(base_url, max_pages = 10) {
all_data <- list()
page <- 1
while (page <= max_pages) {
url <- paste0(base_url, "?page=", page)
cat("獲取第", page, "頁...\n")
response <- GET(url)
if (status_code(response) != 200) {
cat("獲取第", page, "頁失敗\n")
break
}
page_data <- content(response, "parsed")
# 檢查是否還有數據
if (length(page_data) == 0) {
cat("沒有更多數據\n")
break
}
all_data[[page]] <- page_data
page <- page + 1
# 避免API限制
Sys.sleep(0.5)
}
return(all_data)
}
# 處理速率限制
with_retry <- function(expr, max_retries = 3, delay = 1) {
expr <- substitute(expr) # 捕獲未求值的表達式,每次重試都重新執行
env <- parent.frame()
for (attempt in 0:max_retries) {
result <- tryCatch(
list(value = eval(expr, env), error = NULL),
error = function(e) list(value = NULL, error = e)
)
if (is.null(result$error)) {
return(result$value) # 成功即返回
}
msg <- conditionMessage(result$error)
is_rate_limit <- grepl("429", msg) || grepl("rate limit", msg, ignore.case = TRUE)
if (!is_rate_limit || attempt == max_retries) {
stop(result$error) # 非速率限制錯誤或已達最大重試次數,直接拋出
}
wait_time <- delay * (2^attempt) # 指數退避
cat("達到速率限制,等待", wait_time, "秒後重試...\n")
Sys.sleep(wait_time)
}
}
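with_retry的一個使用示意如下(httpbin的/status/429端點固定返回429,僅用於演示退避行為,實際調用已註釋)。
# 使用示例:把可能觸發429的請求包進重試邏輯
fetch_once <- function() {
resp <- GET("https://httpbin.org/status/429")
if (status_code(resp) == 429) stop("429 Too Many Requests")
content(resp, "text")
}
# result <- with_retry(fetch_once(), max_retries = 2, delay = 1)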
📊 第三部分:實戰API案例
3.1 天氣API數據獲取
# 使用OpenWeatherMap API(需要註冊獲取API密鑰)
# 注意:這裏使用示例,實際需要替換為有效的API密鑰
get_weather_data <- function(city, api_key) {
base_url <- "http://api.openweathermap.org/data/2.5/weather"
params <- list(
q = city,
appid = api_key,
units = "metric", # 使用攝氏度
lang = "zh_cn" # 中文描述
)
response <- GET(base_url, query = params)
if (status_code(response) == 200) {
weather_data <- content(response, "parsed")
# 提取所需信息
result <- list(
city = weather_data$name,
country = weather_data$sys$country,
temperature = weather_data$main$temp,
feels_like = weather_data$main$feels_like,
humidity = weather_data$main$humidity,
pressure = weather_data$main$pressure,
weather = weather_data$weather[[1]]$description,
wind_speed = weather_data$wind$speed,
sunrise = format(as.POSIXct(weather_data$sys$sunrise,
origin = "1970-01-01"),
"%H:%M:%S"),
sunset = format(as.POSIXct(weather_data$sys$sunset,
origin = "1970-01-01"),
"%H:%M:%S")
)
return(result)
} else {
warning(paste("獲取天氣數據失敗,狀態碼:", status_code(response)))
return(NULL)
}
}
# 獲取多個城市的天氣
get_multiple_cities_weather <- function(cities, api_key) {
weather_list <- list()
for (city in cities) {
cat("獲取", city, "的天氣數據...\n")
weather <- get_weather_data(city, api_key)
if (!is.null(weather)) {
weather_list[[city]] <- weather
}
# 避免API限制
Sys.sleep(1)
}
# 轉換為數據框
weather_df <- do.call(rbind, lapply(weather_list, as.data.frame))
return(weather_df)
}
# 示例使用
# api_key <- "your_openweathermap_api_key"
# cities <- c("Beijing", "Shanghai", "Guangzhou", "Shenzhen")
# weather_data <- get_multiple_cities_weather(cities, api_key)
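為減少重複請求,可以給天氣查詢加一層簡單的本地緩存;下面是一個基於RDS文件的示意(緩存目錄與過期時間均為示例取值)。
# 簡單的響應緩存:一小時內重複查詢直接讀本地文件
cached_weather <- function(city, api_key, cache_dir = "weather_cache", max_age_hours = 1) {
if (!dir.exists(cache_dir)) dir.create(cache_dir)
cache_file <- file.path(cache_dir, paste0(city, ".rds"))
if (file.exists(cache_file)) {
age_hours <- as.numeric(difftime(Sys.time(), file.mtime(cache_file), units = "hours"))
if (age_hours < max_age_hours) {
cat("使用緩存數據:", city, "\n")
return(readRDS(cache_file))
}
}
result <- get_weather_data(city, api_key) # 調用上面定義的函數
if (!is.null(result)) saveRDS(result, cache_file)
return(result)
}
# weather <- cached_weather("Beijing", api_key)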
3.2 新聞API數據獲取
# 使用NewsAPI獲取新聞數據
# 注意:需要註冊獲取API密鑰
fetch_news <- function(query = NULL, category = NULL, country = "us",
page_size = 10, api_key) {
base_url <- "https://newsapi.org/v2/top-headlines"
params <- list(
apiKey = api_key,
pageSize = page_size
)
# 添加查詢參數
if (!is.null(query)) {
params$q <- query
}
if (!is.null(category)) {
params$category <- category
}
if (!is.null(country)) {
params$country <- country
}
response <- GET(base_url, query = params)
if (status_code(response) == 200) {
news_data <- content(response, "parsed")
if (news_data$totalResults > 0) {
# 提取文章信息
articles <- lapply(news_data$articles, function(article) {
data.frame(
source = article$source$name,
author = ifelse(is.null(article$author), "", article$author),
title = article$title,
description = ifelse(is.null(article$description), "", article$description),
url = article$url,
published_at = article$publishedAt,
stringsAsFactors = FALSE
)
})
articles_df <- do.call(rbind, articles)
return(articles_df)
} else {
cat("未找到相關新聞\n")
return(NULL)
}
} else {
warning(paste("獲取新聞失敗,狀態碼:", status_code(response)))
return(NULL)
}
}
# 新聞數據分析函數(分組統計需要dplyr,英文停用詞需要stopwords套件)
library(dplyr)
analyze_news <- function(news_df) {
if (is.null(news_df)) return(NULL)
cat("=== 新聞數據分析 ===\n")
# 按來源統計
source_stats <- news_df %>%
group_by(source) %>%
summarize(
article_count = n(),
latest_article = max(published_at)
) %>%
arrange(desc(article_count))
cat("\n1. 新聞來源分佈:\n")
print(source_stats)
# 提取關鍵詞(簡單示例)
extract_keywords <- function(text) {
words <- unlist(strsplit(tolower(text), "\\W+"))
words <- words[nchar(words) > 3]
words <- words[!words %in% stopwords::stopwords("en")]
return(words)
}
# 分析標題中的常見詞
all_titles <- paste(news_df$title, collapse = " ")
title_words <- extract_keywords(all_titles)
word_freq <- sort(table(title_words), decreasing = TRUE)
cat("\n2. 標題熱門關鍵詞(前10):\n")
print(head(word_freq, 10))
return(list(
source_stats = source_stats,
word_freq = word_freq
))
}
# 示例使用
# api_key <- "your_newsapi_key"
# tech_news <- fetch_news(query = "technology", page_size = 20, api_key = api_key)
# if (!is.null(tech_news)) {
# analysis <- analyze_news(tech_news)
# }
3.3 金融數據API
# 使用Alpha Vantage獲取股票數據
# 注意:需要註冊獲取API密鑰
get_stock_data <- function(symbol, api_key, output_size = "compact") {
base_url <- "https://www.alphavantage.co/query"
params <- list(
`function` = "TIME_SERIES_DAILY", # function是R保留字,作為參數名需加反引號
symbol = symbol,
outputsize = output_size,
apikey = api_key
)
response <- GET(base_url, query = params)
if (status_code(response) == 200) {
stock_data <- content(response, "parsed")
# 檢查API返回的錯誤信息
if (!is.null(stock_data$`Error Message`)) {
warning(stock_data$`Error Message`)
return(NULL)
}
if (!is.null(stock_data$`Note`)) {
warning("API調用頻率限制:", stock_data$`Note`)
return(NULL)
}
# 提取時間序列數據
time_series <- stock_data$`Time Series (Daily)`
if (is.null(time_series)) {
warning("未找到股票數據")
return(NULL)
}
# 轉換為數據框
dates <- names(time_series)
stock_list <- list()
for (date in dates) {
daily_data <- time_series[[date]]
stock_list[[date]] <- data.frame(
date = as.Date(date),
open = as.numeric(daily_data$`1. open`),
high = as.numeric(daily_data$`2. high`),
low = as.numeric(daily_data$`3. low`),
close = as.numeric(daily_data$`4. close`),
volume = as.numeric(daily_data$`5. volume`),
stringsAsFactors = FALSE
)
}
stock_df <- do.call(rbind, stock_list)
rownames(stock_df) <- NULL
# 按日期排序
stock_df <- stock_df[order(stock_df$date), ]
# 計算技術指標(移動平均使用TTR套件,需先安裝)
stock_df$returns <- c(NA, diff(log(stock_df$close)))
stock_df$sma_20 <- TTR::SMA(stock_df$close, n = 20)
stock_df$sma_50 <- TTR::SMA(stock_df$close, n = 50)
return(stock_df)
} else {
warning(paste("獲取股票數據失敗,狀態碼:", status_code(response)))
return(NULL)
}
}
# 多股票數據獲取
get_multiple_stocks <- function(symbols, api_key) {
all_stocks <- list()
for (symbol in symbols) {
cat("獲取", symbol, "的數據...\n")
stock_data <- get_stock_data(symbol, api_key)
if (!is.null(stock_data)) {
stock_data$symbol <- symbol
all_stocks[[symbol]] <- stock_data
}
# Alpha Vantage有嚴格的頻率限制(5次/分鐘)
Sys.sleep(13) # 等待13秒以避免限制
}
# 合併所有股票數據
combined_df <- do.call(rbind, all_stocks)
return(combined_df)
}
# 股票數據分析
analyze_stocks <- function(stocks_df) {
if (is.null(stocks_df)) return(NULL)
cat("=== 股票數據分析 ===\n")
# 按股票分組分析
stock_summary <- stocks_df %>%
group_by(symbol) %>%
summarize(
start_date = min(date),
end_date = max(date),
days = n(),
start_price = first(close),
end_price = last(close),
total_return = (end_price - start_price) / start_price * 100,
avg_daily_volume = mean(volume, na.rm = TRUE),
volatility = sd(returns, na.rm = TRUE) * sqrt(252) * 100, # 年化波動率
worst_daily_return = min(returns, na.rm = TRUE) * 100 # 最差單日對數收益(並非嚴格意義的最大回撤)
) %>%
arrange(desc(total_return))
cat("\n1. 股票表現彙總:\n")
print(stock_summary)
# 相關性分析
library(tidyr)
# 創建收益矩陣
returns_matrix <- stocks_df %>%
select(symbol, date, returns) %>%
pivot_wider(names_from = symbol, values_from = returns) %>%
select(-date) %>%
as.matrix()
# 計算相關性矩陣
cor_matrix <- cor(returns_matrix, use = "complete.obs")
cat("\n2. 股票收益相關性矩陣:\n")
print(round(cor_matrix, 3))
return(list(
summary = stock_summary,
correlation = cor_matrix
))
}
# 示例使用
# api_key <- "your_alphavantage_api_key"
# symbols <- c("AAPL", "MSFT", "GOOGL")
# stocks_data <- get_multiple_stocks(symbols, api_key)
# if (!is.null(stocks_data)) {
# analysis <- analyze_stocks(stocks_data)
# }
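拿到stocks_data後,可以用ggplot2把收盤價和均線畫出來;下面是一個示意函數(假設已安裝ggplot2,且已運行上面的數據獲取代碼)。
# 繪製收盤價與移動平均線
plot_stock_prices <- function(stocks_df) {
library(ggplot2)
ggplot(stocks_df, aes(x = date)) +
geom_line(aes(y = close, color = "收盤價")) +
geom_line(aes(y = sma_20, color = "SMA 20"), na.rm = TRUE) +
geom_line(aes(y = sma_50, color = "SMA 50"), na.rm = TRUE) +
facet_wrap(~ symbol, scales = "free_y") +
labs(title = "收盤價與移動平均線", x = "日期", y = "價格", color = "序列") +
theme_minimal()
}
# plot_stock_prices(stocks_data)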
🖥️ 第四部分:動態網頁抓取
4.1 RSelenium基礎
# RSelenium用於處理JavaScript渲染的動態網頁
# 首先需要安裝Java和Docker,或者使用standalone server
# 安裝RSelenium
install.packages("RSelenium")
library(RSelenium)
# 啓動Selenium服務器(方法1:使用Docker)
# 在終端運行: docker run -d -p 4445:4444 selenium/standalone-chrome
# 方法2:使用RSelenium內置方法
# 檢查是否已安裝Java
# system("java -version")
# 啓動Selenium服務器
rD <- rsDriver(
browser = "chrome",
chromever = "latest", # 或指定版本號,如"114.0.5735.90"
port = 4445L,
verbose = FALSE
)
# 獲取客户端
remDr <- rD[["client"]]
# 訪問網頁
remDr$navigate("https://www.example.com")
# 獲取頁面標題
title <- remDr$getTitle()[[1]]
cat("頁面標題:", title, "\n")
# 獲取頁面源代碼
page_source <- remDr$getPageSource()[[1]]
# 關閉連接
remDr$close()
rD[["server"]]$stop()
4.2 動態網頁交互
# 啓動Selenium
rD <- rsDriver(browser = "chrome", port = 4445L, verbose = FALSE)
remDr <- rD[["client"]]
# 訪問示例網站
remDr$navigate("https://www.r-project.org/")
# 查找元素
# 通過ID查找
try({
search_box <- remDr$findElement(using = "id", "searchfield")
cat("找到搜索框\n")
})
# 通過CSS選擇器查找
try({
links <- remDr$findElements(using = "css selector", "a")
cat("找到", length(links), "個鏈接\n")
})
# 通過XPath查找
try({
heading <- remDr$findElement(using = "xpath", "//h1")
cat("找到標題:", heading$getElementText()[[1]], "\n")
})
# 與元素交互
# 輸入文本
try({
search_box$sendKeysToElement(list("ggplot2", key = "enter"))
Sys.sleep(2) # 等待頁面加載
})
# 點擊元素
try({
first_result <- remDr$findElement(using = "css selector", ".gs-title a")
first_result$clickElement()
Sys.sleep(2)
})
# 執行JavaScript
remDr$executeScript("return document.title;")
# 滾動頁面
remDr$executeScript("window.scrollTo(0, document.body.scrollHeight);")
Sys.sleep(1)
remDr$executeScript("window.scrollTo(0, 0);")
# 截圖
remDr$screenshot(file = "screenshot.png")
# 關閉
remDr$close()
rD[["server"]]$stop()
4.3 處理登錄和表單
# 處理需要登錄的網站
# 注意:這裏使用示例網站,實際網站可能不同
# 啓動Selenium
rD <- rsDriver(browser = "chrome", port = 4445L, verbose = FALSE)
remDr <- rD[["client"]]
# 訪問登錄頁面
login_url <- "https://example.com/login" # 示例URL
remDr$navigate(login_url)
Sys.sleep(2)
# 填寫登錄表單
try({
# 查找用户名輸入框
username_input <- remDr$findElement(using = "id", "username")
username_input$sendKeysToElement(list("your_username"))
# 查找密碼輸入框
password_input <- remDr$findElement(using = "id", "password")
password_input$sendKeysToElement(list("your_password"))
# 查找登錄按鈕並點擊
login_button <- remDr$findElement(using = "css selector", "button[type='submit']")
login_button$clickElement()
Sys.sleep(3) # 等待登錄完成
# 檢查是否登錄成功
current_url <- remDr$getCurrentUrl()[[1]]
if (!grepl("login", current_url)) {
cat("登錄成功\n")
} else {
cat("登錄可能失敗\n")
}
})
# 處理其他表單
# 填寫搜索表單
try({
search_input <- remDr$findElement(using = "name", "q")
search_input$clearElement()
search_input$sendKeysToElement(list("data science", key = "enter"))
Sys.sleep(2)
})
# 處理下拉菜單
try({
dropdown <- remDr$findElement(using = "id", "dropdown")
dropdown$clickElement()
Sys.sleep(0.5)
option <- remDr$findElement(using = "xpath", "//option[@value='option2']")
option$clickElement()
})
# 處理多選框
try({
checkbox <- remDr$findElement(using = "id", "agree_terms")
if (!checkbox$isElementSelected()[[1]]) {
checkbox$clickElement()
}
})
# 關閉
remDr$close()
rD[["server"]]$stop()
🛡️ 第五部分:高級技巧與最佳實踐
5.1 代理和用户代理設置
# 設置用户代理
user_agent <- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
# 使用httr設置用户代理
response <- GET(
"https://httpbin.org/user-agent",
add_headers(`User-Agent` = user_agent)
)
cat(content(response, "text"))
# 使用代理
# 設置代理服務器
proxy_url <- "http://proxy.example.com:8080"
# 方法1:使用httr的config
response <- GET(
"https://httpbin.org/ip",
use_proxy(url = proxy_url, username = "user", password = "pass")
)
# 方法2:設置系統代理
# Sys.setenv(http_proxy = proxy_url)
# Sys.setenv(https_proxy = proxy_url)
# 使用rvest通過代理
session <- session(
"https://httpbin.org/ip",
httr::use_proxy(url = proxy_url)
)
# 輪換用户代理和代理
user_agents <- c(
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"
)
proxies <- c(
"http://proxy1.example.com:8080",
"http://proxy2.example.com:8080",
"http://proxy3.example.com:8080"
)
# 輪換請求
make_rotating_request <- function(url, user_agents, proxies) {
# 隨機選擇用户代理和代理
ua <- sample(user_agents, 1)
proxy <- sample(proxies, 1)
cat("使用用户代理:", ua, "\n")
cat("使用代理:", proxy, "\n")
tryCatch({
response <- GET(
url,
add_headers(`User-Agent` = ua),
use_proxy(url = proxy),
timeout(10)
)
return(content(response, "text"))
}, error = function(e) {
warning(paste("請求失敗:", e$message))
return(NULL)
})
}
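make_rotating_request的調用方式示意如下(示例代理地址僅作佔位,需替換為真實可用的代理才會成功)。
# 使用示例(代理為佔位地址,實際使用時請替換)
# result <- make_rotating_request("https://httpbin.org/ip", user_agents, proxies)
# if (!is.null(result)) cat(result, "\n")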
5.2 處理驗證碼和反爬機制
# 處理驗證碼的常見策略
# 注意:自動破解驗證碼可能違反網站條款,請謹慎使用
# 1. 使用驗證碼識別服務(需要付費API)
solve_captcha <- function(image_url, api_key) {
# 這裏使用2Captcha示例
# 實際使用時需要根據驗證碼服務商的API文檔
solve_url <- "http://2captcha.com/in.php"
result_url <- "http://2captcha.com/res.php"
# 提交驗證碼
submit_params <- list(
key = api_key,
method = "base64",
body = image_url, # 這裏應該是base64編碼的圖像
json = 1
)
submit_response <- POST(solve_url, body = submit_params, encode = "form")
submit_data <- content(submit_response, "parsed")
if (submit_data$status == 1) {
request_id <- submit_data$request
# 等待並獲取結果
for (i in 1:30) { # 最多嘗試30次
Sys.sleep(5)
result_params <- list(
key = api_key,
action = "get",
id = request_id,
json = 1
)
result_response <- GET(result_url, query = result_params)
result_data <- content(result_response, "parsed")
if (result_data$status == 1) {
return(result_data$request) # 返回驗證碼文本
}
}
}
return(NULL)
}
# 2. 避免觸發反爬機制(需傳入Selenium客户端remDr)
avoid_anti_scraping <- function(remDr) {
# 隨機延遲
delay <- runif(1, 1, 5)
Sys.sleep(delay)
# 模擬人類行為
# 隨機滾動
if (runif(1) > 0.7) {
scroll_amount <- sample(100:500, 1)
remDr$executeScript(paste0("window.scrollBy(0, ", scroll_amount, ");"))
Sys.sleep(runif(1, 0.5, 2))
}
# 隨機移動鼠標(在Selenium中)
# 實際實現需要更復雜的代碼
}
# 3. 使用會話保持登錄狀態
maintain_session <- function() {
# 創建會話
s <- session("https://example.com/login")
# 取得登錄頁面上的第一個表單並填寫字段(字段名依實際網站而定)
login_form <- html_form(s)[[1]]
login_form <- html_form_set(
login_form,
username = "your_username",
password = "your_password"
)
# 提交表單,rvest會自動沿用同一會話的cookie
s <- session_submit(s, login_form)
# 使用同一個會話訪問需要登錄的頁面
s <- session_jump_to(s, "https://example.com/protected_page")
return(s)
}
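除了控制請求頻率,抓取前先確認robots.txt是否允許也是基本禮儀;下面用robotstxt套件給出一個小示例(假設已安裝該套件)。
# 抓取前檢查robots.txt是否允許
# install.packages("robotstxt")
library(robotstxt)
allowed <- paths_allowed("http://books.toscrape.com/catalogue/page-1.html")
if (isTRUE(allowed)) {
cat("robots.txt 允許抓取該路徑\n")
} else {
cat("robots.txt 不允許抓取,請勿繼續\n")
}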
5.3 數據存儲和調度
# 將抓取的數據存儲到數據庫
library(DBI)
library(RSQLite)
# 創建數據庫存儲爬蟲數據
setup_crawler_db <- function(db_path = "crawler_data.db") {
con <- dbConnect(RSQLite::SQLite(), db_path)
# 創建網頁數據表
dbExecute(con, "
CREATE TABLE IF NOT EXISTS webpage_data (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT NOT NULL,
title TEXT,
content TEXT,
html TEXT,
crawl_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
UNIQUE(url, crawl_time)
)
")
# 創建API數據表
dbExecute(con, "
CREATE TABLE IF NOT EXISTS api_data (
id INTEGER PRIMARY KEY AUTOINCREMENT,
api_endpoint TEXT NOT NULL,
parameters TEXT,
response_data TEXT,
call_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
")
# 創建日誌表
dbExecute(con, "
CREATE TABLE IF NOT EXISTS crawl_log (
id INTEGER PRIMARY KEY AUTOINCREMENT,
task_name TEXT,
url TEXT,
status TEXT,
message TEXT,
duration REAL,
log_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
")
dbDisconnect(con)
}
# 存儲網頁數據
store_webpage_data <- function(url, title, content, html, db_path = "crawler_data.db") {
con <- dbConnect(RSQLite::SQLite(), db_path)
on.exit(dbDisconnect(con))
# 防止SQL注入
safe_url <- dbQuoteString(con, url)
safe_title <- dbQuoteString(con, title)
safe_content <- dbQuoteString(con, content)
safe_html <- dbQuoteString(con, html)
sql <- sprintf("
INSERT INTO webpage_data (url, title, content, html)
VALUES (%s, %s, %s, %s)",
safe_url, safe_title, safe_content, safe_html
)
dbExecute(con, sql)
}
# 記錄爬蟲日誌
log_crawl <- function(task_name, url, status, message, duration,
db_path = "crawler_data.db") {
con <- dbConnect(RSQLite::SQLite(), db_path)
on.exit(dbDisconnect(con))
sql <- sprintf("
INSERT INTO crawl_log (task_name, url, status, message, duration)
VALUES ('%s', '%s', '%s', '%s', %f)",
task_name, url, status, message, duration
)
dbExecute(con, sql)
}
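上面三個函數串起來的用法大致如下(以books.toscrape首頁為例,僅作示意,假設rvest已載入)。
# 使用示例:初始化數據庫,存一條網頁記錄並寫日誌
setup_crawler_db()
start_time <- Sys.time()
page <- read_html("http://books.toscrape.com/")
page_title <- html_text(html_node(page, "title"))
store_webpage_data(
url = "http://books.toscrape.com/",
title = page_title,
content = html_text(page),
html = as.character(page)
)
duration <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
log_crawl("demo_task", "http://books.toscrape.com/", "success", "首頁抓取完成", duration)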
# 定時爬蟲任務
schedule_crawler <- function() {
# 使用cronR包進行任務調度(若未安裝請先執行一次 install.packages("cronR"))
library(cronR)
# 創建爬蟲腳本
crawler_script <- "
library(rvest)
library(httr)
source('crawler_functions.R')
# 執行爬蟲任務
result <- crawl_news_sites()
# 記錄結果
saveRDS(result, paste0('crawl_results_', Sys.Date(), '.rds'))
"
writeLines(crawler_script, "daily_crawler.R")
# 創建cron任務(每天凌晨2點運行)
cmd <- cron_rscript("daily_crawler.R")
cron_add(command = cmd, frequency = 'daily', at = '02:00',
id = 'daily_news_crawl', description = '每日新聞爬取')
# 列出所有任務
cron_ls()
}
🏭 第六部分:綜合實戰案例
案例1:房地產價格監控系統
# 房地產價格監控爬蟲
library(rvest)
library(httr)
library(dplyr)
library(DBI)
library(RSQLite)
# 配置數據庫
setup_realestate_db <- function() {
con <- dbConnect(RSQLite::SQLite(), "realestate.db")
# 創建房源表
dbExecute(con, "
CREATE TABLE IF NOT EXISTS properties (
id INTEGER PRIMARY KEY AUTOINCREMENT,
source TEXT NOT NULL,
property_id TEXT UNIQUE,
title TEXT,
price REAL,
location TEXT,
area REAL,
bedrooms INTEGER,
bathrooms INTEGER,
property_type TEXT,
listing_date DATE,
url TEXT UNIQUE,
crawl_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
")
# 創建價格歷史表
dbExecute(con, "
CREATE TABLE IF NOT EXISTS price_history (
id INTEGER PRIMARY KEY AUTOINCREMENT,
property_id TEXT,
price REAL,
record_date DATE,
FOREIGN KEY (property_id) REFERENCES properties(property_id)
)
")
dbDisconnect(con)
}
# 鏈家爬蟲函數
crawl_lianjia <- function(city = "bj", district = NULL, max_pages = 3) {
base_url <- paste0("https://", city, ".lianjia.com/ershoufang/")
if (!is.null(district)) {
base_url <- paste0(base_url, district, "/")
}
all_properties <- list()
for (page in 1:max_pages) {
url <- paste0(base_url, "pg", page)
cat("抓取鏈家第", page, "頁:", url, "\n")
tryCatch({
# 設置請求頭
headers <- add_headers(
`User-Agent` = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
`Accept` = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
)
response <- GET(url, headers)
if (status_code(response) != 200) {
cat("請求失敗,狀態碼:", status_code(response), "\n")
break
}
webpage <- read_html(content(response, "text"))
# 提取房源列表
listings <- html_nodes(webpage, ".sellListContent li")
for (listing in listings) {
try({
# 提取房源信息
title <- html_text(html_node(listing, ".title a"))
price <- html_text(html_node(listing, ".totalPrice span"))
price <- as.numeric(gsub("[^0-9.]", "", price))
# 提取房屋信息
house_info <- html_text(html_node(listing, ".houseInfo"))
house_info_parts <- strsplit(house_info, "\\|")[[1]]
# 解析房屋信息
location <- trimws(house_info_parts[1])
layout <- trimws(house_info_parts[2])
area <- as.numeric(gsub("[^0-9.]", "", house_info_parts[3]))
# 提取户型信息(layout形如"2室1廳",無法匹配時返回NA)
bedrooms <- suppressWarnings(as.numeric(sub("^.*?([0-9]+)室.*$", "\\1", layout)))
bathrooms <- suppressWarnings(as.numeric(sub("^.*?([0-9]+)衞.*$", "\\1", layout)))
# 提取鏈接
link <- html_attr(html_node(listing, ".title a"), "href")
property_id <- gsub(".*/|.html", "", link)
# 存儲數據
property_data <- data.frame(
source = "lianjia",
property_id = property_id,
title = title,
price = price,
location = location,
area = area,
bedrooms = bedrooms,
bathrooms = bathrooms,
property_type = "二手房",
url = link,
stringsAsFactors = FALSE
)
all_properties[[length(all_properties) + 1]] <- property_data
}, error = function(e) {
# 忽略單個房源的錯誤
})
}
# 延遲避免被屏蔽
Sys.sleep(runif(1, 1, 3))
}, error = function(e) {
cat("抓取第", page, "頁時出錯:", e$message, "\n")
})
}
# 合併所有數據
if (length(all_properties) > 0) {
properties_df <- do.call(rbind, all_properties)
return(properties_df)
} else {
return(NULL)
}
}
# 存儲到數據庫
store_properties <- function(properties_df, db_path = "realestate.db") {
con <- dbConnect(RSQLite::SQLite(), db_path)
on.exit(dbDisconnect(con))
# 檢查數據庫是否已存在相同property_id的記錄
existing_ids <- dbGetQuery(con, "SELECT property_id FROM properties")$property_id
new_properties <- properties_df[!properties_df$property_id %in% existing_ids, ]
existing_properties <- properties_df[properties_df$property_id %in% existing_ids, ]
# 插入新記錄
if (nrow(new_properties) > 0) {
dbWriteTable(con, "properties", new_properties, append = TRUE, row.names = FALSE)
cat("插入了", nrow(new_properties), "條新記錄\n")
}
# 更新現有記錄的價格
if (nrow(existing_properties) > 0) {
for (i in 1:nrow(existing_properties)) {
prop <- existing_properties[i, ]
# 檢查價格是否有變化
old_price <- dbGetQuery(con,
sprintf("SELECT price FROM properties WHERE property_id = '%s'", prop$property_id)
)$price
if (!is.na(old_price) && old_price != prop$price) {
# 更新價格
dbExecute(con, sprintf("
UPDATE properties
SET price = %.2f, last_updated = CURRENT_TIMESTAMP
WHERE property_id = '%s'",
prop$price, prop$property_id
))
# 記錄價格歷史
dbExecute(con, sprintf("
INSERT INTO price_history (property_id, price, record_date)
VALUES ('%s', %.2f, DATE('now'))",
prop$property_id, prop$price
))
cat("更新了房源", prop$property_id, "的價格: ", old_price, " -> ", prop$price, "\n")
}
}
}
}
# 數據分析函數
analyze_realestate <- function(db_path = "realestate.db") {
con <- dbConnect(RSQLite::SQLite(), db_path)
on.exit(dbDisconnect(con))
cat("=== 房地產數據分析 ===\n\n")
# 1. 基本統計
basic_stats <- dbGetQuery(con, "
SELECT
COUNT(*) as total_properties,
COUNT(DISTINCT location) as distinct_locations,
AVG(price) as avg_price,
MIN(price) as min_price,
MAX(price) as max_price,
AVG(area) as avg_area,
AVG(price/area) as avg_price_per_sqm
FROM properties
")
cat("1. 基本統計:\n")
print(basic_stats)
# 2. 價格分佈
price_distribution <- dbGetQuery(con, "
SELECT
CASE
WHEN price < 100 THEN '低於100萬'
WHEN price < 300 THEN '100-300萬'
WHEN price < 500 THEN '300-500萬'
WHEN price < 1000 THEN '500-1000萬'
ELSE '1000萬以上'
END as price_range,
COUNT(*) as property_count,
AVG(price) as avg_price_in_range,
AVG(area) as avg_area_in_range
FROM properties
GROUP BY price_range
ORDER BY MIN(price)
")
cat("\n2. 價格分佈:\n")
print(price_distribution)
# 3. 區域分析
area_analysis <- dbGetQuery(con, "
SELECT
location,
COUNT(*) as property_count,
AVG(price) as avg_price,
AVG(area) as avg_area,
AVG(price/area) as avg_price_per_sqm
FROM properties
GROUP BY location
HAVING COUNT(*) >= 5
ORDER BY avg_price_per_sqm DESC
LIMIT 10
")
cat("\n3. 最貴區域(單價前10):\n")
print(area_analysis)
# 4. 户型分析
layout_analysis <- dbGetQuery(con, "
SELECT
bedrooms,
bathrooms,
COUNT(*) as property_count,
AVG(price) as avg_price,
AVG(area) as avg_area
FROM properties
WHERE bedrooms IS NOT NULL AND bathrooms IS NOT NULL
GROUP BY bedrooms, bathrooms
ORDER BY bedrooms, bathrooms
")
cat("\n4. 户型分析:\n")
print(layout_analysis)
return(list(
basic_stats = basic_stats,
price_distribution = price_distribution,
area_analysis = area_analysis,
layout_analysis = layout_analysis
))
}
# 主函數
main_realestate_crawler <- function() {
# 初始化數據庫
setup_realestate_db()
# 抓取數據
cat("開始抓取鏈家數據...\n")
properties <- crawl_lianjia(city = "bj", max_pages = 2)
if (!is.null(properties)) {
cat("成功抓取", nrow(properties), "條房源數據\n")
# 存儲數據
store_properties(properties)
# 分析數據
analysis <- analyze_realestate()
# 生成報告
generate_report(analysis)
} else {
cat("抓取失敗\n")
}
}
# 生成HTML報告
generate_report <- function(analysis, output_file = "realestate_report.html") {
library(knitr)
library(rmarkdown)
report_template <- "
# 房地產價格監控報告
生成時間: `r Sys.time()`
## 1. 數據概覽
- 總房源數: `r analysis$basic_stats$total_properties`
- 平均價格: `r round(analysis$basic_stats$avg_price, 2)` 萬元
- 平均面積: `r round(analysis$basic_stats$avg_area, 2)` 平方米
- 平均單價: `r round(analysis$basic_stats$avg_price_per_sqm, 2)` 萬元/平方米
## 2. 價格分佈
```{r, echo=FALSE}
knitr::kable(analysis$price_distribution)
```
## 3. 區域分析
```{r, echo=FALSE}
knitr::kable(analysis$area_analysis)
```
## 4. 户型分析
```{r, echo=FALSE}
knitr::kable(analysis$layout_analysis)
```
"
writeLines(report_template, "report_template.Rmd")
rmarkdown::render("report_template.Rmd", output_file = output_file)
cat("報告已生成:", output_file, "\n")
}
# 運行主函數
main_realestate_crawler()
案例2:社交媒體監控系統
# 社交媒體監控系統
library(httr)
library(jsonlite)
library(dplyr)
library(lubridate)
# 配置多個社交媒體API
# 注意:以下代碼需要相應的API密鑰
# Twitter API v2監控
monitor_twitter <- function(query, api_key, max_results = 100) {
base_url <- "https://api.twitter.com/2/tweets/search/recent"
headers <- add_headers(
`Authorization` = paste("Bearer", api_key)
)
params <- list(
query = query,
max_results = max_results,
"tweet.fields" = "created_at,public_metrics,entities",
"user.fields" = "username,name"
)
response <- GET(base_url, headers, query = params)
if (status_code(response) == 200) {
tweets_data <- content(response, "parsed")
if (!is.null(tweets_data$data)) {
# 處理推文數據
tweets_list <- lapply(tweets_data$data, function(tweet) {
data.frame(
platform = "Twitter",
tweet_id = tweet$id,
text = tweet$text,
created_at = tweet$created_at,
retweet_count = tweet$public_metrics$retweet_count,
reply_count = tweet$public_metrics$reply_count,
like_count = tweet$public_metrics$like_count,
quote_count = tweet$public_metrics$quote_count,
stringsAsFactors = FALSE
)
})
tweets_df <- do.call(rbind, tweets_list)
return(tweets_df)
}
}
return(NULL)
}
# Reddit API監控
monitor_reddit <- function(subreddit, query = NULL, limit = 100) {
base_url <- paste0("https://www.reddit.com/r/", subreddit, "/new.json")
params <- list(limit = limit)
if (!is.null(query)) {
base_url <- paste0("https://www.reddit.com/r/", subreddit, "/search.json")
params$q = query
params$restrict_sr = "on"
params$sort = "new"
}
headers <- add_headers(
`User-Agent` = "SocialMediaMonitor/1.0"
)
response <- GET(base_url, headers, query = params)
if (status_code(response) == 200) {
reddit_data <- content(response, "parsed")
posts_list <- lapply(reddit_data$data$children, function(post) {
post_data <- post$data
data.frame(
platform = "Reddit",
post_id = post_data$id,
title = post_data$title,
text = post_data$selftext,
subreddit = post_data$subreddit,
author = post_data$author,
created_utc = as.POSIXct(post_data$created_utc, origin = "1970-01-01"),
score = post_data$score,
num_comments = post_data$num_comments,
upvote_ratio = post_data$upvote_ratio,
url = post_data$url,
stringsAsFactors = FALSE
)
})
posts_df <- do.call(rbind, posts_list)
return(posts_df)
}
return(NULL)
}
# 情感分析函數
analyze_sentiment <- function(texts) {
# 簡單的情感分析(基於詞典)
# 實際應用中可以使用更復雜的NLP模型
positive_words <- c("好", "優秀", "棒", "贊", "喜歡", "愛", "高興", "開心",
"完美", "精彩", "推薦", "支持", "感謝", "棒極了")
negative_words <- c("差", "糟糕", "爛", "討厭", "恨", "生氣", "傷心",
"失望", "垃圾", "問題", "投訴", "不好", "不行")
sentiments <- sapply(texts, function(text) {
text_lower <- tolower(text)
pos_count <- sum(sapply(positive_words, function(word) {
grepl(word, text_lower)
}))
neg_count <- sum(sapply(negative_words, function(word) {
grepl(word, text_lower)
}))
if (pos_count > neg_count) {
return("積極")
} else if (neg_count > pos_count) {
return("消極")
} else {
return("中性")
}
})
return(sentiments)
}
# 趨勢分析
analyze_trends <- function(social_data, time_window = "hour") {
if (is.null(social_data) || nrow(social_data) == 0) {
return(NULL)
}
# 確保有時間戳列
if (!"created_at" %in% names(social_data)) {
if ("created_utc" %in% names(social_data)) {
social_data$created_at <- social_data$created_utc
} else {
social_data$created_at <- Sys.time()
}
}
# 統一時間格式後按時間窗口分組(Twitter的created_at是ISO 8601字符串)
if (is.character(social_data$created_at)) {
social_data$created_at <- lubridate::ymd_hms(social_data$created_at, quiet = TRUE)
}
social_data$time_group <- floor_date(social_data$created_at, time_window)
trends <- social_data %>%
group_by(platform, time_group) %>%
summarize(
post_count = n(),
avg_score = mean(score, na.rm = TRUE),
total_comments = sum(num_comments, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(time_group)
return(trends)
}
# 關鍵詞提取
extract_keywords <- function(texts, n = 10) {
all_text <- paste(texts, collapse = " ")
# 簡單的關鍵詞提取(基於詞頻)
words <- unlist(strsplit(all_text, "\\W+"))
words <- words[nchar(words) > 1]
words <- tolower(words)
# 移除停用詞
stopwords_ch <- c("的", "了", "在", "是", "我", "有", "和", "就",
"不", "人", "都", "一", "一個", "上", "也", "很",
"到", "説", "要", "去", "你", "會", "着", "沒有",
"看", "好", "自己", "這")
words <- words[!words %in% stopwords_ch]
word_freq <- sort(table(words), decreasing = TRUE)
return(head(word_freq, n))
}
# 主監控函數
social_media_monitor <- function(queries, duration_hours = 24) {
all_data <- list()
# 配置API密鑰(實際使用時從環境變量或配置文件讀取)
# twitter_api_key <- Sys.getenv("TWITTER_API_KEY")
# reddit_client_id <- Sys.getenv("REDDIT_CLIENT_ID")
# reddit_client_secret <- Sys.getenv("REDDIT_CLIENT_SECRET")
for (query in queries) {
cat("監控關鍵詞:", query, "\n")
# 監控Twitter
cat(" 獲取Twitter數據...\n")
twitter_data <- monitor_twitter(query, api_key = "dummy_key", max_results = 50)
if (!is.null(twitter_data)) {
twitter_data$query <- query
twitter_data$sentiment <- analyze_sentiment(twitter_data$text)
all_data[["twitter"]] <- rbind(all_data[["twitter"]], twitter_data)
}
# 監控Reddit
cat(" 獲取Reddit數據...\n")
reddit_data <- monitor_reddit("all", query = query, limit = 50)
if (!is.null(reddit_data)) {
reddit_data$query <- query
reddit_data$sentiment <- analyze_sentiment(paste(reddit_data$title, reddit_data$text))
all_data[["reddit"]] <- rbind(all_data[["reddit"]], reddit_data)
}
# 延遲避免API限制
Sys.sleep(2)
}
# 合併所有數據(兩個平台的列不完全相同,用bind_rows自動補NA)
combined_data <- dplyr::bind_rows(all_data)
if (!is.null(combined_data) && nrow(combined_data) > 0) {
# 分析趨勢
trends <- analyze_trends(combined_data, "hour")
# 提取熱門關鍵詞
all_texts <- c(
if (!is.null(all_data[["twitter"]])) all_data[["twitter"]]$text,
if (!is.null(all_data[["reddit"]])) paste(all_data[["reddit"]]$title, all_data[["reddit"]]$text)
)
keywords <- extract_keywords(all_texts, 20)
# 情感分佈
sentiment_dist <- if (!is.null(combined_data$sentiment)) {
table(combined_data$sentiment)
} else {
NULL
}
return(list(
raw_data = combined_data,
trends = trends,
keywords = keywords,
sentiment = sentiment_dist,
summary = list(
total_posts = nrow(combined_data),
platforms = unique(combined_data$platform),
time_range = range(combined_data$created_at, na.rm = TRUE)
)
))
}
return(NULL)
}
# 生成監控報告
generate_social_report <- function(monitor_results, output_file = "social_media_report.html") {
library(ggplot2)
library(plotly)
if (is.null(monitor_results)) {
cat("沒有數據可生成報告\n")
return(NULL)
}
# 創建報告
report <- list()
# 1. 摘要
report$summary <- monitor_results$summary
# 2. 趨勢圖表
if (!is.null(monitor_results$trends)) {
trend_plot <- ggplot(monitor_results$trends,
aes(x = time_group, y = post_count, color = platform)) +
geom_line() +
geom_point() +
labs(title = "社交媒體趨勢", x = "時間", y = "發帖數量") +
theme_minimal()
report$trend_plot <- trend_plot
}
# 3. 關鍵詞雲數據
if (!is.null(monitor_results$keywords)) {
keyword_df <- data.frame(
word = names(monitor_results$keywords),
freq = as.numeric(monitor_results$keywords)
)
report$keywords <- keyword_df
}
# 4. 情感分析
if (!is.null(monitor_results$sentiment)) {
sentiment_df <- data.frame(
sentiment = names(monitor_results$sentiment),
count = as.numeric(monitor_results$sentiment)
)
sentiment_plot <- ggplot(sentiment_df, aes(x = sentiment, y = count, fill = sentiment)) +
geom_bar(stat = "identity") +
labs(title = "情感分析", x = "情感", y = "數量") +
theme_minimal()
report$sentiment_plot <- sentiment_plot
}
# 5. 熱門內容
if (!is.null(monitor_results$raw_data)) {
top_posts <- monitor_results$raw_data %>%
arrange(desc(score)) %>%
head(10) %>%
select(platform, text, score, sentiment, created_at)
report$top_posts <- top_posts
}
# 保存報告
saveRDS(report, gsub(".html", ".rds", output_file))
# 生成HTML報告
html_report <- "
<!DOCTYPE html>
<html>
<head>
<title>社交媒體監控報告</title>
<style>
body { font-family: Arial, sans-serif; margin: 40px; }
.section { margin-bottom: 40px; border-bottom: 1px solid #ddd; padding-bottom: 20px; }
h1, h2 { color: #333; }
table { border-collapse: collapse; width: 100%; }
th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
th { background-color: #f2f2f2; }
</style>
</head>
<body>
<h1>社交媒體監控報告</h1>
<p>生成時間: %s</p>
<div class='section'>
<h2>1. 數據摘要</h2>
<p>總發帖數: %d</p>
<p>監控平台: %s</p>
<p>時間範圍: %s 到 %s</p>
</div>
</body>
</html>
"
html_content <- sprintf(html_report,
Sys.time(),
report$summary$total_posts,
paste(report$summary$platforms, collapse = ", "),
format(report$summary$time_range[1], "%Y-%m-%d %H:%M:%S"),
format(report$summary$time_range[2], "%Y-%m-%d %H:%M:%S"))
writeLines(html_content, output_file)
cat("報告已生成:", output_file, "\n")
return(report)
}
# 示例使用
# queries <- c("數據科學", "機器學習", "人工智能")
# results <- social_media_monitor(queries, duration_hours = 1)
# if (!is.null(results)) {
# report <- generate_social_report(results)
# }
💻 今日練習
練習1:簡單網頁爬蟲
# 1. 選擇任意新聞網站(如:新華網、人民網等)
# 2. 使用rvest抓取首頁新聞標題和鏈接
# 3. 提取每條新聞的發佈時間和摘要
# 4. 將數據保存到CSV文件
# 5. 添加錯誤處理和延遲,避免被屏蔽
練習2:API數據獲取
# 1. 註冊OpenWeatherMap免費API賬號
# 2. 編寫函數獲取指定城市的天氣數據
# 3. 獲取5個主要城市的天氣信息
# 4. 分析並比較這些城市的天氣狀況
# 5. 將結果可視化(温度、濕度、風速等)
練習3:動態內容抓取
# 1. 使用RSelenium訪問一個使用JavaScript渲染的網站
# 2. 模擬用户行為:搜索、翻頁、點擊等
# 3. 抓取至少3頁的動態內容
# 4. 將抓取的數據存儲到數據庫
# 5. 實現簡單的數據分析和報告生成
📌 今日總結
重點掌握:
- ✓ 使用rvest包進行靜態網頁爬蟲
- ✓ 使用httr包與API交互
- ✓ 網頁數據解析(CSS選擇器、XPath)
- ✓ 處理動態網頁(RSelenium)
- ✓ API認證和錯誤處理
網絡爬蟲注意事項:
- 遵守robots.txt:尊重網站的爬蟲政策
- 設置合理的延遲:避免對服務器造成壓力
- 處理異常:網絡請求可能失敗,要有錯誤處理
- 尊重版權:不要濫用爬取的數據
- 注意法律風險:某些數據可能受法律保護
API使用要點:
- 閲讀文檔:仔細閲讀API文檔,瞭解限制和用法
- 管理密鑰:安全存儲API密鑰,不要硬編碼在代碼中
- 處理限制:瞭解並遵守API的速率限制
- 錯誤處理:處理各種HTTP狀態碼
- 數據緩存:緩存API響應以減少請求次數
🎯 明日預告
第十三天:文本挖掘與自然語言處理
- 文本預處理技術
- 詞頻分析和TF-IDF
- 情感分析
- 主題建模(LDA)
- 文本分類
學習建議:
- 從簡單的靜態網站開始練習爬蟲
- 使用免費API進行實踐,如OpenWeatherMap、NewsAPI等
- 注意網絡爬蟲的倫理和法律問題
- 學習查看網頁源代碼,理解HTML結構
- 實際項目中,考慮使用現成的數據源或API,避免重複造輪子
記得保存今天的代碼腳本為day12.R,明天我們將進入文本挖掘的世界!📝