Hive maps structured data files onto database tables and provides SQL-like query capability, compiling SQL statements into MapReduce, Tez, or Spark jobs.
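Because every query is compiled into a distributed job, you can inspect the plan before running anything; a minimal sketch (which engines are available depends on your cluster):
SET hive.execution.engine=tez;  -- mr, tez, or spark

-- EXPLAIN prints the job plan Hive compiles the query into
EXPLAIN SELECT COUNT(*) FROM page_view_logs WHERE dt = '2024-01-01';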
Key features:
- SQL interface: queries are written in HiveQL, a SQL-like language
- Big-data scale: handles datasets at the petabyte level
- Data warehousing: well suited to offline batch processing and warehouse construction
- Extensibility: custom UDFs, UDAFs, and UDTFs (see the sketch after this list)
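On the extensibility point, a hypothetical sketch of registering a custom UDF: ADD JAR and CREATE TEMPORARY FUNCTION are the standard HiveQL commands, while the jar path and class name below are placeholders:
-- Hypothetical UDF registration: jar path and class name are placeholders
ADD JAR hdfs:///libs/my-udfs.jar;
CREATE TEMPORARY FUNCTION parse_device AS 'com.example.udf.ParseDevice';

-- Use it like any built-in function
SELECT parse_device(user_agent) AS device, COUNT(*) AS pv
FROM page_view_logs
WHERE dt = '2024-01-01'
GROUP BY parse_device(user_agent);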
Use cases
- Data warehouse construction: enterprise data warehouses (EDW), data lake query engines, historical data analysis
- ETL processing: data cleansing and transformation, format conversion, data quality checks
- Offline data analysis: daily/weekly/monthly reports, user behavior analysis, business metric calculation
- Data exploration: exploratory analysis by data scientists, ad-hoc queries
Implementing "Website Page-View Statistics" with Hive
1. Environment Setup
Hive table design
-- Raw log table (stores raw events ingested from Kafka or files)
CREATE TABLE IF NOT EXISTS page_view_logs (
    log_id      BIGINT    COMMENT 'log ID',
    page_url    STRING    COMMENT 'page URL',
    user_id     STRING    COMMENT 'user ID',
    event_type  STRING    COMMENT 'event type',
    server_time TIMESTAMP COMMENT 'server-side timestamp',
    client_time TIMESTAMP COMMENT 'client-side timestamp',
    ip_address  STRING    COMMENT 'IP address',
    user_agent  STRING    COMMENT 'user agent',
    other_info  STRING    COMMENT 'additional info'
) COMMENT 'raw page-view log table'
PARTITIONED BY (dt STRING COMMENT 'date partition')
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
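For reference, a hypothetical tab-separated line matching this schema (fields in declared order; the values are made up):
1001	/home	u_00042	page_view	2024-01-01 08:30:00	2024-01-01 08:29:58	192.168.1.10	Mozilla/5.0	-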
-- Daily statistics result table
-- (ORC defines its own serialization, so no ROW FORMAT clause is needed)
CREATE TABLE IF NOT EXISTS page_view_daily_stats (
    stat_date       STRING    COMMENT 'statistics date',
    page_url        STRING    COMMENT 'page URL',
    pv_count        BIGINT    COMMENT 'page views',
    uv_count        BIGINT    COMMENT 'unique visitors',
    avg_pv_per_user DOUBLE    COMMENT 'average views per user',
    peak_hour       INT       COMMENT 'peak traffic hour',
    create_time     TIMESTAMP COMMENT 'record creation time'
) COMMENT 'daily page-view statistics table'
PARTITIONED BY (stat_month STRING COMMENT 'month partition')
STORED AS ORC;
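ORC handles its own storage layout; if you also want column compression, it can be set through table properties. A sketch, assuming Snappy is available on the cluster (the setting applies to newly written files):
-- Optional: compress the ORC stats table with Snappy (default is ZLIB)
ALTER TABLE page_view_daily_stats SET TBLPROPERTIES ("orc.compress" = "SNAPPY");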
-- Hourly statistics table (for near-real-time analysis)
CREATE TABLE IF NOT EXISTS page_view_hourly_stats (
    stat_hour   STRING    COMMENT 'statistics hour',
    page_url    STRING    COMMENT 'page URL',
    pv_count    BIGINT    COMMENT 'page views',
    uv_count    BIGINT    COMMENT 'unique visitors',
    create_time TIMESTAMP COMMENT 'record creation time'
) COMMENT 'hourly page-view statistics table'
PARTITIONED BY (stat_date STRING COMMENT 'date partition')
STORED AS ORC;
2. Data Import Scripts
Loading data from HDFS
-- Load data into the raw log table
LOAD DATA INPATH '/user/hive/warehouse/logs/page_views_20240101.log'
INTO TABLE page_view_logs
PARTITION (dt='2024-01-01');
-- Alternatively, use an external table (recommended)
CREATE EXTERNAL TABLE IF NOT EXISTS page_view_logs_external (
    log_id BIGINT,
    page_url STRING,
    user_id STRING,
    event_type STRING,
    server_time TIMESTAMP,
    client_time TIMESTAMP,
    ip_address STRING,
    user_agent STRING,
    other_info STRING
) COMMENT 'external page-view log table'
PARTITIONED BY (dt STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LOCATION '/user/hive/warehouse/logs/';
-- Discover and register all partitions found under the table location
MSCK REPAIR TABLE page_view_logs_external;
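MSCK REPAIR scans the whole table location, which gets slow as partitions accumulate; in a daily pipeline it is usually cheaper to register just the new partition. The partition directory below is an assumption about the layout:
-- Alternative to MSCK: register a single day's partition explicitly
ALTER TABLE page_view_logs_external ADD IF NOT EXISTS
PARTITION (dt='2024-01-01')
LOCATION '/user/hive/warehouse/logs/dt=2024-01-01';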
3. Core Analysis Scripts
Daily statistics
-- Daily page-view statistics.
-- The peak hour is computed in its own CTE and joined back; a correlated
-- LATERAL VIEW subquery of the original shape is not valid HiveQL.
WITH hourly_pv AS (
    SELECT
        page_url,
        CAST(DATE_FORMAT(server_time, 'HH') AS INT) AS stat_hour,
        COUNT(*) AS hour_pv
    FROM page_view_logs
    WHERE dt = '2024-01-01'
      AND event_type = 'page_view'
    GROUP BY page_url, CAST(DATE_FORMAT(server_time, 'HH') AS INT)
),
peak AS (
    -- hour with the highest traffic per page
    SELECT page_url, stat_hour AS peak_hour
    FROM (
        SELECT
            page_url,
            stat_hour,
            ROW_NUMBER() OVER (PARTITION BY page_url ORDER BY hour_pv DESC) AS rn
        FROM hourly_pv
    ) t
    WHERE rn = 1
),
daily AS (
    SELECT
        dt AS stat_date,
        page_url,
        COUNT(*)                AS pv_count,   -- total page views
        COUNT(DISTINCT user_id) AS uv_count,   -- unique visitors
        ROUND(COUNT(*) / COUNT(DISTINCT user_id), 2) AS avg_pv_per_user
    FROM page_view_logs
    WHERE dt = '2024-01-01'
      AND event_type = 'page_view'
    GROUP BY dt, page_url
)
INSERT OVERWRITE TABLE page_view_daily_stats PARTITION (stat_month = '2024-01')
SELECT
    d.stat_date,
    d.page_url,
    d.pv_count,
    d.uv_count,
    d.avg_pv_per_user,
    p.peak_hour,
    CURRENT_TIMESTAMP AS create_time
FROM daily d
JOIN peak p ON d.page_url = p.page_url;
-- Query the daily statistics
SELECT * FROM page_view_daily_stats
WHERE stat_month='2024-01'
ORDER BY pv_count DESC
LIMIT 10;
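To confirm the insert landed in the expected month partition:
SHOW PARTITIONS page_view_daily_stats;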
Hourly statistics (near real-time)
-- Hourly page-view statistics
INSERT OVERWRITE TABLE page_view_hourly_stats PARTITION(stat_date='2024-01-01')
SELECT
CONCAT(DATE_FORMAT(server_time, 'yyyy-MM-dd HH'), ':00') as stat_hour,
page_url,
COUNT(*) as pv_count,
COUNT(DISTINCT user_id) as uv_count,
CURRENT_TIMESTAMP as create_time
FROM page_view_logs
WHERE dt = '2024-01-01'
AND event_type = 'page_view'
AND server_time >= '2024-01-01 00:00:00'
AND server_time < '2024-01-02 00:00:00'
GROUP BY
DATE_FORMAT(server_time, 'yyyy-MM-dd HH'),
page_url;
-- Query the hourly statistics
SELECT * FROM page_view_hourly_stats
WHERE stat_date='2024-01-01'
ORDER BY stat_hour, pv_count DESC;
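When backfilling several days, a dynamic-partition variant avoids one INSERT per day; a sketch over a one-week range (the same settings reappear in the scheduling script in section 6):
SET hive.exec.dynamic.partition = true;
SET hive.exec.dynamic.partition.mode = nonstrict;

-- One pass over a date range; the dynamic partition column must come last
INSERT OVERWRITE TABLE page_view_hourly_stats PARTITION (stat_date)
SELECT
    CONCAT(DATE_FORMAT(server_time, 'yyyy-MM-dd HH'), ':00') as stat_hour,
    page_url,
    COUNT(*) as pv_count,
    COUNT(DISTINCT user_id) as uv_count,
    CURRENT_TIMESTAMP as create_time,
    dt as stat_date
FROM page_view_logs
WHERE dt BETWEEN '2024-01-01' AND '2024-01-07'
  AND event_type = 'page_view'
GROUP BY dt, DATE_FORMAT(server_time, 'yyyy-MM-dd HH'), page_url;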
4. Advanced Analysis Scripts
User behavior path analysis
-- Per-user previous/next page transitions
WITH user_sessions AS (
SELECT
user_id,
page_url,
server_time,
LAG(page_url) OVER (PARTITION BY user_id ORDER BY server_time) as prev_page,
LEAD(page_url) OVER (PARTITION BY user_id ORDER BY server_time) as next_page
FROM page_view_logs
WHERE dt = '2024-01-01'
AND event_type = 'page_view'
),
page_transitions AS (
SELECT
COALESCE(prev_page, 'ENTRY') as from_page,
page_url as to_page,
COUNT(*) as transition_count
FROM user_sessions
GROUP BY COALESCE(prev_page, 'ENTRY'), page_url
)
SELECT
from_page,
to_page,
transition_count,
ROUND(transition_count * 100.0 / SUM(transition_count) OVER (PARTITION BY from_page), 2) as percentage
FROM page_transitions
ORDER BY from_page, transition_count DESC;
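The path analysis above treats a user's whole day as one sequence; in practice you would usually split it into sessions first. A minimal sketch with a 30-minute idle timeout (the threshold is an assumption):
WITH ordered AS (
    SELECT
        user_id,
        page_url,
        server_time,
        LAG(server_time) OVER (PARTITION BY user_id ORDER BY server_time) as prev_time
    FROM page_view_logs
    WHERE dt = '2024-01-01'
      AND event_type = 'page_view'
)
SELECT
    user_id,
    page_url,
    server_time,
    -- a new session starts after a gap of more than 30 minutes (1800 seconds)
    SUM(CASE WHEN prev_time IS NULL
              OR unix_timestamp(server_time) - unix_timestamp(prev_time) > 1800
             THEN 1 ELSE 0 END)
        OVER (PARTITION BY user_id ORDER BY server_time) as session_seq
FROM ordered;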
Top-page trend analysis
-- Hourly trend for the top pages
SELECT
    l.page_url,
    DATE_FORMAT(l.server_time, 'HH') AS stat_hour,
    COUNT(*)                  AS pv_count,
    COUNT(DISTINCT l.user_id) AS uv_count
FROM page_view_logs l
JOIN (
    -- top 5 pages by daily PV; a JOIN avoids Hive's restrictions on IN subqueries
    SELECT page_url
    FROM page_view_daily_stats
    WHERE stat_date = '2024-01-01'
    ORDER BY pv_count DESC
    LIMIT 5
) top_pages ON l.page_url = top_pages.page_url
WHERE l.dt = '2024-01-01'
  AND l.event_type = 'page_view'
GROUP BY l.page_url, DATE_FORMAT(l.server_time, 'HH')
ORDER BY l.page_url, stat_hour;
5. Calling Hive from Java
Hive JDBC connection example
import java.sql.*;

public class HivePageViewAnalysis {
    private static final String DRIVER_NAME = "org.apache.hive.jdbc.HiveDriver";
    private static final String CONNECTION_URL = "jdbc:hive2://localhost:10000/analytics_db";

    public static void main(String[] args) {
        Connection connection = null;
        Statement statement = null;
        ResultSet resultSet = null;
        try {
            // 1. Load the Hive JDBC driver
            Class.forName(DRIVER_NAME);
            // 2. Open the connection
            connection = DriverManager.getConnection(CONNECTION_URL, "hive", "");
            statement = connection.createStatement();
            // 3. Run the daily statistics job
            String dailyStatsSQL =
                "INSERT OVERWRITE TABLE page_view_daily_stats PARTITION(stat_month='2024-01') " +
                "SELECT " +
                "  dt as stat_date, " +
                "  page_url, " +
                "  COUNT(*) as pv_count, " +
                "  COUNT(DISTINCT user_id) as uv_count, " +
                "  ROUND(COUNT(*) / COUNT(DISTINCT user_id), 2) as avg_pv_per_user, " +
                "  0 as peak_hour, " +          // simplified: peak hour not computed here
                "  CURRENT_TIMESTAMP as create_time " +
                "FROM page_view_logs " +
                "WHERE dt = '2024-01-01' " +
                "  AND event_type = 'page_view' " +
                "GROUP BY dt, page_url";
            System.out.println("Running daily statistics job...");
            statement.execute(dailyStatsSQL);
            // 4. Query the results
            String querySQL =
                "SELECT stat_date, page_url, pv_count, uv_count, avg_pv_per_user " +
                "FROM page_view_daily_stats " +
                "WHERE stat_month = '2024-01' " +
                "ORDER BY pv_count DESC " +
                "LIMIT 10";
            resultSet = statement.executeQuery(querySQL);
            // 5. Print the results
            System.out.println("=== Top 10 pages by views ===");
            System.out.println("date\t\tpage URL\t\tPV\tUV\tavg PV/user");
            System.out.println("------------------------------------------------------------");
            while (resultSet.next()) {
                String statDate = resultSet.getString("stat_date");
                String pageUrl = resultSet.getString("page_url");
                long pvCount = resultSet.getLong("pv_count");
                long uvCount = resultSet.getLong("uv_count");
                double avgPv = resultSet.getDouble("avg_pv_per_user");
                System.out.printf("%s\t%s\t%d\t%d\t%.2f%n",
                        statDate, pageUrl, pvCount, uvCount, avgPv);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release resources
            try {
                if (resultSet != null) resultSet.close();
                if (statement != null) statement.close();
                if (connection != null) connection.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }
}
6. Shell Script Scheduling
Daily statistics job script
#!/bin/bash
# Hive daily statistics job
# Usage: ./daily_pageview_stats.sh 2024-01-01

# Argument check
if [ $# -ne 1 ]; then
    echo "Usage: $0 <date>"
    echo "Example: $0 2024-01-01"
    exit 1
fi

STAT_DATE=$1
STAT_MONTH=$(date -d "$STAT_DATE" +%Y-%m)
echo "Starting page-view statistics job for date: $STAT_DATE"

# Run the Hive SQL
hive -e "
SET hive.exec.dynamic.partition = true;
SET hive.exec.dynamic.partition.mode = nonstrict;

-- Daily statistics
INSERT OVERWRITE TABLE page_view_daily_stats PARTITION(stat_month='$STAT_MONTH')
SELECT
    dt as stat_date,
    page_url,
    COUNT(*) as pv_count,
    COUNT(DISTINCT user_id) as uv_count,
    ROUND(COUNT(*) / COUNT(DISTINCT user_id), 2) as avg_pv_per_user,
    0 as peak_hour,
    CURRENT_TIMESTAMP as create_time
FROM page_view_logs
WHERE dt = '$STAT_DATE'
    AND event_type = 'page_view'
GROUP BY dt, page_url;

-- Hourly statistics
INSERT OVERWRITE TABLE page_view_hourly_stats PARTITION(stat_date='$STAT_DATE')
SELECT
    CONCAT(DATE_FORMAT(server_time, 'yyyy-MM-dd HH'), ':00') as stat_hour,
    page_url,
    COUNT(*) as pv_count,
    COUNT(DISTINCT user_id) as uv_count,
    CURRENT_TIMESTAMP as create_time
FROM page_view_logs
WHERE dt = '$STAT_DATE'
    AND event_type = 'page_view'
GROUP BY DATE_FORMAT(server_time, 'yyyy-MM-dd HH'), page_url;
"

# Check the exit status
if [ $? -eq 0 ]; then
    echo "Page-view statistics job finished successfully"
    # Optional notification
    # send_notification "PageView Stats" "Daily stats completed for $STAT_DATE"
else
    echo "Page-view statistics job failed"
    exit 1
fi
7. Exporting Data to MySQL
Hive to MySQL export
-- External table backed by MySQL via Hive's JDBC storage handler
-- (org.apache.hive.storage.jdbc.JdbcStorageHandler, available in Hive 3.0+;
-- the MySQL JDBC driver jar must be on the Hive classpath)
CREATE EXTERNAL TABLE page_view_stats_mysql (
    stat_date STRING,
    page_url STRING,
    pv_count BIGINT,
    uv_count BIGINT,
    avg_pv_per_user DOUBLE,
    peak_hour INT,
    create_time TIMESTAMP
)
STORED BY 'org.apache.hive.storage.jdbc.JdbcStorageHandler'
TBLPROPERTIES (
    "hive.sql.database.type" = "MYSQL",
    "hive.sql.jdbc.driver" = "com.mysql.cj.jdbc.Driver",
    "hive.sql.jdbc.url" = "jdbc:mysql://localhost:3306/analytics_db",
    "hive.sql.dbcp.username" = "root",
    "hive.sql.dbcp.password" = "password",
    "hive.sql.table" = "page_view_stats"
);
-- Export to MySQL; the JDBC storage handler supports INSERT INTO, not
-- INSERT OVERWRITE. On clusters without it, Sqoop export is the usual route.
INSERT INTO TABLE page_view_stats_mysql
SELECT
    stat_date,
    page_url,
    pv_count,
    uv_count,
    avg_pv_per_user,
    peak_hour,
    create_time
FROM page_view_daily_stats
WHERE stat_month = '2024-01';
8. Performance Tuning
Hive tuning parameters
-- Set optimization parameters at the top of your Hive script
SET hive.exec.parallel = true;
SET hive.exec.parallel.thread.number = 8;
SET hive.auto.convert.join = true;
SET hive.merge.mapfiles = true;
SET hive.merge.mapredfiles = true;
SET hive.merge.size.per.task = 256000000;
SET hive.merge.smallfiles.avgsize = 128000000;
SET hive.exec.compress.output = true;
SET mapreduce.output.fileoutputformat.compress.codec = org.apache.hadoop.io.compress.SnappyCodec;  -- mapred.output.compression.codec on older releases
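Two further switches that are often worth enabling, assuming a Hive version recent enough to support them (vectorized execution requires ORC input):
SET hive.vectorized.execution.enabled = true;  -- batch row processing on ORC
SET hive.cbo.enable = true;                    -- cost-based optimizer (Calcite)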
Run steps
- Prepare the Hadoop environment: make sure the HDFS and Hive services are up
- Create the Hive tables: run the DDL statements above
- Import the data: upload the log files to HDFS and load them into Hive
- Run the analysis jobs: execute the Hive SQL or the Java program
- Check the results: query the statistics in Hive or MySQL