Hive maps structured data files onto database tables and provides SQL-style querying, translating SQL statements into MapReduce, Tez, or Spark jobs (a minimal example follows the feature list below).

  • SQL interface: queries are written in HiveQL (SQL-like)
  • Big-data processing: handles data at PB scale
  • Data warehousing: well suited to offline batch processing and warehouse construction
  • Extensibility: custom UDFs, UDAFs, and UDTFs can be plugged in
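
A minimal illustration, using the page_view_logs table defined later in this article: prefixing any query with EXPLAIN shows the distributed plan Hive compiles it into.

-- Show the execution plan Hive generates for a simple aggregation
EXPLAIN
SELECT page_url, COUNT(*) AS pv
FROM page_view_logs
WHERE dt = '2024-01-01'
GROUP BY page_url;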

Use Cases

  1. Data warehouse construction

Enterprise data warehouse (EDW)
Data lake query engine
Historical data analysis

  2. ETL processing

Data cleansing and transformation
Data format conversion
Data quality checks

  3. Offline data analysis

Daily/weekly/monthly reports
User behavior analysis
Business metric computation

  4. Data exploration

Exploration by data scientists
Ad-hoc query analysis

Implementing "Website Page View Statistics" with Hive

1. Environment Setup

Hive Table Schema Design

-- Create the raw log table (stores raw records imported from Kafka or files)
CREATE TABLE IF NOT EXISTS page_view_logs (
    log_id BIGINT COMMENT 'log ID',
    page_url STRING COMMENT 'page URL',
    user_id STRING COMMENT 'user ID',
    event_type STRING COMMENT 'event type',
    server_time TIMESTAMP COMMENT 'server time',
    client_time TIMESTAMP COMMENT 'client time',
    ip_address STRING COMMENT 'IP address',
    user_agent STRING COMMENT 'user agent',
    other_info STRING COMMENT 'additional info'
) COMMENT 'raw page view log table'
PARTITIONED BY (dt STRING COMMENT 'date partition')
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;

-- Create the daily statistics result table
CREATE TABLE IF NOT EXISTS page_view_daily_stats (
    stat_date STRING COMMENT 'statistics date',
    page_url STRING COMMENT 'page URL',
    pv_count BIGINT COMMENT 'page views',
    uv_count BIGINT COMMENT 'unique visitors',
    avg_pv_per_user DOUBLE COMMENT 'average page views per user',
    peak_hour INT COMMENT 'peak traffic hour',
    create_time TIMESTAMP COMMENT 'creation time'
) COMMENT 'daily page view statistics table'
PARTITIONED BY (stat_month STRING COMMENT 'month partition')
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS ORC;

-- Create the hourly statistics table (for near-real-time analysis)
CREATE TABLE IF NOT EXISTS page_view_hourly_stats (
    stat_hour STRING COMMENT 'statistics hour',
    page_url STRING COMMENT 'page URL',
    pv_count BIGINT COMMENT 'page views',
    uv_count BIGINT COMMENT 'unique visitors',
    create_time TIMESTAMP COMMENT 'creation time'
) COMMENT 'hourly page view statistics table'
PARTITIONED BY (stat_date STRING COMMENT 'date partition')
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS ORC;
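
After creating the tables, a quick way to confirm the schema and partition layout took effect is with Hive's standard inspection commands:

-- List the partitions Hive knows about for the raw log table
SHOW PARTITIONS page_view_logs;

-- Show column types, storage format, and HDFS location for a stats table
DESCRIBE FORMATTED page_view_daily_stats;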

2. Data Import Scripts

Loading Data from HDFS

-- Load data into the raw log table
LOAD DATA INPATH '/user/hive/warehouse/logs/page_views_20240101.log' 
INTO TABLE page_view_logs 
PARTITION (dt='2024-01-01');

-- Alternatively, use an external table (recommended)
CREATE EXTERNAL TABLE IF NOT EXISTS page_view_logs_external (
    log_id BIGINT,
    page_url STRING,
    user_id STRING,
    event_type STRING,
    server_time TIMESTAMP,
    client_time TIMESTAMP,
    ip_address STRING,
    user_agent STRING,
    other_info STRING
) COMMENT 'external page view log table'
PARTITIONED BY (dt STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LOCATION '/user/hive/warehouse/logs/';

-- Recover partitions from the directory layout
MSCK REPAIR TABLE page_view_logs_external;
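
MSCK REPAIR scans the entire table location for partition directories. When only one day's data lands at a time, registering the partition explicitly is cheaper; a sketch, assuming the directories follow the dt=YYYY-MM-DD convention:

-- Register a single partition instead of rescanning the whole location
ALTER TABLE page_view_logs_external
ADD IF NOT EXISTS PARTITION (dt='2024-01-01')
LOCATION '/user/hive/warehouse/logs/dt=2024-01-01/';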

3. Core Analysis Scripts

Daily Statistics

-- Daily page view statistics.
-- Note: LATERAL VIEW only accepts a UDTF, not a subquery, so the peak hour
-- is computed in a separate subquery and joined back per page.
INSERT OVERWRITE TABLE page_view_daily_stats PARTITION(stat_month='2024-01')
SELECT 
    base.stat_date,
    base.page_url,
    base.pv_count,            -- total page views
    base.uv_count,            -- distinct users
    base.avg_pv_per_user,
    peak.peak_hour,           -- hour with the highest PV for this page
    CURRENT_TIMESTAMP AS create_time
FROM (
    SELECT 
        dt AS stat_date,
        page_url,
        COUNT(*) AS pv_count,
        COUNT(DISTINCT user_id) AS uv_count,
        ROUND(COUNT(*) / COUNT(DISTINCT user_id), 2) AS avg_pv_per_user
    FROM page_view_logs
    WHERE dt = '2024-01-01'
      AND event_type = 'page_view'
    GROUP BY dt, page_url
) base
JOIN (
    -- For each page, pick the hour with the highest page view count
    SELECT page_url, CAST(stat_hour AS INT) AS peak_hour
    FROM (
        SELECT 
            page_url,
            DATE_FORMAT(server_time, 'HH') AS stat_hour,
            ROW_NUMBER() OVER (PARTITION BY page_url ORDER BY COUNT(*) DESC) AS rn
        FROM page_view_logs
        WHERE dt = '2024-01-01'
          AND event_type = 'page_view'
        GROUP BY page_url, DATE_FORMAT(server_time, 'HH')
    ) t
    WHERE rn = 1
) peak ON base.page_url = peak.page_url;

-- Query the statistics results
SELECT * FROM page_view_daily_stats 
WHERE stat_month='2024-01' 
ORDER BY pv_count DESC 
LIMIT 10;
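
To sanity-check the peak_hour column, the hourly distribution of a single page can be inspected directly; '/home' below is just an illustrative URL:

-- Hourly PV for one page; the top row should match its peak_hour value
SELECT DATE_FORMAT(server_time, 'HH') AS stat_hour, COUNT(*) AS pv
FROM page_view_logs
WHERE dt = '2024-01-01'
  AND event_type = 'page_view'
  AND page_url = '/home'
GROUP BY DATE_FORMAT(server_time, 'HH')
ORDER BY pv DESC;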

Hourly Statistics (Near Real-Time)

-- Hourly page view statistics
INSERT OVERWRITE TABLE page_view_hourly_stats PARTITION(stat_date='2024-01-01')
SELECT 
    CONCAT(DATE_FORMAT(server_time, 'yyyy-MM-dd HH'), ':00') as stat_hour,
    page_url,
    COUNT(*) as pv_count,
    COUNT(DISTINCT user_id) as uv_count,
    CURRENT_TIMESTAMP as create_time
FROM page_view_logs
WHERE dt = '2024-01-01'
  AND event_type = 'page_view'
  AND server_time >= '2024-01-01 00:00:00' 
  AND server_time < '2024-01-02 00:00:00'
GROUP BY 
    DATE_FORMAT(server_time, 'yyyy-MM-dd HH'),
    page_url;

-- Query the hourly statistics results
SELECT * FROM page_view_hourly_stats 
WHERE stat_date='2024-01-01' 
ORDER BY stat_hour, pv_count DESC;
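
Because the hourly and daily tables are derived from the same raw logs, a simple consistency check (the kind of data-quality check listed in the use cases) is to verify that the hourly PV sums match the daily totals:

-- Data-quality check: any row returned means hourly and daily PV disagree
SELECT d.page_url, d.pv_count AS daily_pv, h.hourly_pv
FROM page_view_daily_stats d
JOIN (
    SELECT page_url, SUM(pv_count) AS hourly_pv
    FROM page_view_hourly_stats
    WHERE stat_date = '2024-01-01'
    GROUP BY page_url
) h ON d.page_url = h.page_url
WHERE d.stat_month = '2024-01'
  AND d.stat_date = '2024-01-01'
  AND d.pv_count <> h.hourly_pv;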

4. Advanced Analysis Scripts

User Behavior Path Analysis

-- User navigation path analysis
WITH user_sessions AS (
    SELECT 
        user_id,
        page_url,
        server_time,
        LAG(page_url) OVER (PARTITION BY user_id ORDER BY server_time) as prev_page,
        LEAD(page_url) OVER (PARTITION BY user_id ORDER BY server_time) as next_page  -- computed for completeness; only prev_page is used below
    FROM page_view_logs
    WHERE dt = '2024-01-01'
      AND event_type = 'page_view'
),
page_transitions AS (
    SELECT 
        COALESCE(prev_page, 'ENTRY') as from_page,
        page_url as to_page,
        COUNT(*) as transition_count
    FROM user_sessions
    GROUP BY COALESCE(prev_page, 'ENTRY'), page_url
)
SELECT 
    from_page,
    to_page,
    transition_count,
    ROUND(transition_count * 100.0 / SUM(transition_count) OVER (PARTITION BY from_page), 2) as percentage
FROM page_transitions
ORDER BY from_page, transition_count DESC;
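
The user_sessions CTE above orders each user's events but never actually splits them into sessions. A common refinement, sketched here with an assumed 30-minute inactivity threshold, starts a new session whenever the gap since the previous event exceeds it:

-- Sketch: assign session IDs per user with a 30-minute (1800 s) gap rule
SELECT
    user_id,
    page_url,
    server_time,
    -- running sum of session starts yields a per-user session number
    SUM(is_new_session) OVER (
        PARTITION BY user_id ORDER BY server_time
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS session_id
FROM (
    SELECT
        user_id,
        page_url,
        server_time,
        CASE
            WHEN LAG(server_time) OVER (PARTITION BY user_id ORDER BY server_time) IS NULL
              OR UNIX_TIMESTAMP(server_time)
               - UNIX_TIMESTAMP(LAG(server_time) OVER (PARTITION BY user_id ORDER BY server_time)) > 1800
            THEN 1 ELSE 0
        END AS is_new_session
    FROM page_view_logs
    WHERE dt = '2024-01-01'
      AND event_type = 'page_view'
) e;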

Top Page Trend Analysis

-- Hourly trend for the top pages
SELECT 
    page_url,
    DATE_FORMAT(server_time, 'HH') as hour,
    COUNT(*) as pv_count,
    COUNT(DISTINCT user_id) as uv_count
FROM page_view_logs
WHERE dt = '2024-01-01'
  AND event_type = 'page_view'
  AND page_url IN (
      SELECT page_url 
      FROM page_view_daily_stats 
      WHERE stat_month = '2024-01'   -- filter the partition column first
        AND stat_date = '2024-01-01' 
      ORDER BY pv_count DESC 
      LIMIT 5
  )
GROUP BY page_url, DATE_FORMAT(server_time, 'HH')
ORDER BY page_url, hour;

5. Calling Hive from Java

Hive JDBC Connection Example

import java.sql.*;

public class HivePageViewAnalysis {

    private static final String DRIVER_NAME = "org.apache.hive.jdbc.HiveDriver";
    private static final String CONNECTION_URL = "jdbc:hive2://localhost:10000/analytics_db";

    public static void main(String[] args) {

        Connection connection = null;
        Statement statement = null;
        ResultSet resultSet = null;

        try {
            // 1. Load the Hive JDBC driver
            Class.forName(DRIVER_NAME);

            // 2. Open the connection
            connection = DriverManager.getConnection(CONNECTION_URL, "hive", "");
            statement = connection.createStatement();

            // 3. Run the daily statistics job
            String dailyStatsSQL = 
                "INSERT OVERWRITE TABLE page_view_daily_stats PARTITION(stat_month='2024-01') " +
                "SELECT " +
                "    dt as stat_date, " +
                "    page_url, " +
                "    COUNT(*) as pv_count, " +
                "    COUNT(DISTINCT user_id) as uv_count, " +
                "    ROUND(COUNT(*) / COUNT(DISTINCT user_id), 2) as avg_pv_per_user, " +
                "    0 as peak_hour, " +  // simplified: peak hour not computed here
                "    CURRENT_TIMESTAMP as create_time " +
                "FROM page_view_logs " +
                "WHERE dt = '2024-01-01' " +
                "  AND event_type = 'page_view' " +
                "GROUP BY dt, page_url";

            System.out.println("Running daily statistics job...");
            statement.execute(dailyStatsSQL);

            // 4. Query the statistics results
            String querySQL = 
                "SELECT stat_date, page_url, pv_count, uv_count, avg_pv_per_user " +
                "FROM page_view_daily_stats " +
                "WHERE stat_month = '2024-01' " +
                "ORDER BY pv_count DESC " +
                "LIMIT 10";

            resultSet = statement.executeQuery(querySQL);

            // 5. Print the results
            System.out.println("=== Top 10 pages by PV ===");
            System.out.println("date\t\tpage URL\t\tPV\tUV\tavg PV/user");
            System.out.println("------------------------------------------------------------");

            while (resultSet.next()) {
                String statDate = resultSet.getString("stat_date");
                String pageUrl = resultSet.getString("page_url");
                long pvCount = resultSet.getLong("pv_count");
                long uvCount = resultSet.getLong("uv_count");
                double avgPv = resultSet.getDouble("avg_pv_per_user");

                System.out.printf("%s\t%s\t%d\t%d\t%.2f%n", 
                    statDate, pageUrl, pvCount, uvCount, avgPv);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release resources
            try {
                if (resultSet != null) resultSet.close();
                if (statement != null) statement.close();
                if (connection != null) connection.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }
}

6. Shell Script Scheduling

Daily Statistics Job Script

#!/bin/bash

# Hive daily statistics job
# Usage: ./daily_pageview_stats.sh 2024-01-01

# Argument check
if [ $# -ne 1 ]; then
    echo "Usage: $0 <date>"
    echo "Example: $0 2024-01-01"
    exit 1
fi

STAT_DATE=$1
STAT_MONTH=$(date -d "$STAT_DATE" +%Y-%m)

echo "Starting page view statistics job for date: $STAT_DATE"

# Run the Hive SQL
hive -e "
SET hive.exec.dynamic.partition = true;
SET hive.exec.dynamic.partition.mode = nonstrict;

-- Daily statistics job
INSERT OVERWRITE TABLE page_view_daily_stats PARTITION(stat_month='$STAT_MONTH')
SELECT 
    dt as stat_date,
    page_url,
    COUNT(*) as pv_count,
    COUNT(DISTINCT user_id) as uv_count,
    ROUND(COUNT(*) / COUNT(DISTINCT user_id), 2) as avg_pv_per_user,
    0 as peak_hour,  -- simplified: peak hour not computed here
    CURRENT_TIMESTAMP as create_time
FROM page_view_logs
WHERE dt = '$STAT_DATE'
  AND event_type = 'page_view'
GROUP BY dt, page_url;

-- Hourly statistics job
INSERT OVERWRITE TABLE page_view_hourly_stats PARTITION(stat_date='$STAT_DATE')
SELECT 
    CONCAT(DATE_FORMAT(server_time, 'yyyy-MM-dd HH'), ':00') as stat_hour,
    page_url,
    COUNT(*) as pv_count,
    COUNT(DISTINCT user_id) as uv_count,
    CURRENT_TIMESTAMP as create_time
FROM page_view_logs
WHERE dt = '$STAT_DATE'
  AND event_type = 'page_view'
GROUP BY DATE_FORMAT(server_time, 'yyyy-MM-dd HH'), page_url;
"

# Check the exit status
if [ $? -eq 0 ]; then
    echo "Page view statistics job succeeded"
    
    # Send a notification (optional)
    # send_notification "PageView Stats" "Daily stats completed for $STAT_DATE"
else
    echo "Page view statistics job failed"
    exit 1
fi

7. Exporting Data to MySQL

Hive to MySQL Export

Hive ships no MySQL storage handler, so a common approach is to stage the results as delimited text and push them into MySQL with Apache Sqoop. The sketch below assumes a MySQL table analytics_db.page_view_stats whose columns match page_view_daily_stats.

-- Stage the January results as tab-separated text for Sqoop to pick up
INSERT OVERWRITE DIRECTORY '/tmp/page_view_stats_export'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
SELECT 
    stat_date,
    page_url,
    pv_count,
    uv_count,
    avg_pv_per_user,
    peak_hour,
    create_time
FROM page_view_daily_stats
WHERE stat_month = '2024-01';

# Push the staged files into the MySQL table (run from the shell, not Hive)
sqoop export \
    --connect jdbc:mysql://localhost:3306/analytics_db \
    --username root \
    --password password \
    --table page_view_stats \
    --export-dir /tmp/page_view_stats_export \
    --input-fields-terminated-by '\t'

8. Performance Tuning

Hive Tuning Parameters

-- Set these tuning parameters at the top of the Hive script
SET hive.exec.parallel = true;
SET hive.exec.parallel.thread.number = 8;
SET hive.auto.convert.join = true;
SET hive.merge.mapfiles = true;
SET hive.merge.mapredfiles = true;
SET hive.merge.size.per.task = 256000000;
SET hive.merge.smallfiles.avgsize = 128000000;
SET hive.exec.compress.output = true;
-- mapreduce.output.fileoutputformat.compress.codec supersedes the deprecated mapred.output.compression.codec name
SET mapreduce.output.fileoutputformat.compress.codec = org.apache.hadoop.io.compress.SnappyCodec;
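
Running SET with a property name and no value prints its current value, which is a quick way to confirm the session actually picked up the settings:

-- Print current values to verify the settings took effect
SET hive.exec.parallel;
SET hive.auto.convert.join;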

Running the Pipeline

  1. Prepare the Hadoop environment: make sure the HDFS and Hive services are running
  2. Create the Hive tables: run the CREATE TABLE statements above
  3. Load the data: upload the log files to HDFS and load them into Hive
  4. Run the analysis jobs: execute the Hive SQL or the Java program
  5. Check the results: query the statistics in Hive or MySQL