Table of Contents
Implementation Steps:
1. Hive Data Warehouse Modeling
2. Run the Hive Table-Creation Script
3. Business Analysis
① Basic Verification Script
② Business Analysis Script
4. Commit to the Git Repository (adapt to your own files)
Case Summary
Implementation Steps:
1. Hive Data Warehouse Modeling
Create an HQL script in the scripts directory that defines the two-layer table structure and maps each layer directly onto the data already sitting in HDFS:
-- 1. Initialize the warehouse database (keeps the original logic idempotent)
CREATE DATABASE IF NOT EXISTS ecommerce_dw;
USE ecommerce_dw;
-- 2. ODS layer: raw data
-- Field names match the CSV 1:1; every type is STRING so the raw format is
-- preserved and no load-time conversion can fail
CREATE EXTERNAL TABLE IF NOT EXISTS ods_ecommerce_transactions (
    Transaction_ID   STRING COMMENT 'Transaction ID (raw format; still a string before Spark cleaning)',
    User_Name        STRING COMMENT 'User name (unique user identifier, raw CSV field)',
    Age              STRING COMMENT 'User age (raw format; Spark later casts it to INT)',
    Country          STRING COMMENT 'Transaction country (as in the CSV)',
    Product_Category STRING COMMENT 'Product category (raw format)',
    Purchase_Amount  STRING COMMENT 'Purchase amount (raw format; Spark later casts it to DECIMAL)',
    Payment_Method   STRING COMMENT 'Payment method (raw format)',
    Transaction_Date STRING COMMENT 'Transaction date (raw format: yyyy/M/d)'
)
COMMENT 'ODS layer for cross-border e-commerce transactions (fields map 1:1 to the CSV)'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/raw_data/transactions/'
TBLPROPERTIES (
    'skip.header.line.count' = '1',
    'serialization.encoding' = 'UTF-8'
);
-- 3. DWD layer: cleaned detail data (matches the Spark cleaning output exactly)
CREATE EXTERNAL TABLE IF NOT EXISTS dwd_ecommerce_transactions (
    Transaction_ID   INT           COMMENT 'Transaction ID (deduplicated, nulls removed)',
    User_Name        STRING        COMMENT 'User name (unique user identifier)',
    Age              INT           COMMENT 'User age (18-100; outliers filtered)',
    Country          STRING        COMMENT 'Transaction country',
    Product_Category STRING        COMMENT 'Product category',
    Purchase_Amount  DECIMAL(10,2) COMMENT 'Purchase amount (2 decimal places; USD by default)',
    Payment_Method   STRING        COMMENT 'Payment method',
    Transaction_Date DATE          COMMENT 'Transaction date (yyyy-MM-dd)'
)
COMMENT 'DWD layer of cleaned transaction details (aligned with the Spark output)'
PARTITIONED BY (dt DATE COMMENT 'Partition column: transaction date (daily partitions)')
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/processed_data/transactions_clean/'
TBLPROPERTIES (
    'parquet.compression' = 'snappy',
    'external.table.purge' = 'true'
);
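-- Note: with 'external.table.purge' = 'true', DROP TABLE on this external
-- table also deletes the underlying HDFS data in Hive versions that honor
-- the property; keep it only if that is the intended behavior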
-- 4. DWD partition metadata
-- After Spark writes Parquet files per partition, Hive must refresh its
-- partition metadata before it can see the new data
MSCK REPAIR TABLE dwd_ecommerce_transactions;
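If MSCK REPAIR ever gets slow on a large partition tree, partitions can also be registered one at a time; a minimal sketch, with an illustrative partition value:
-- Register a single daily partition explicitly (example date)
ALTER TABLE dwd_ecommerce_transactions
ADD IF NOT EXISTS PARTITION (dt='2023-01-01');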
2. Run the Hive Table-Creation Script
# 1. Make sure the Hive Metastore and HiveServer2 are running (started in an
#    earlier step; run these only if they are not)
nohup hive --service metastore &
nohup hive --service hiveserver2 &
# 2. Run the HQL script (-f: path of the script file)
hive -f /home/Hadoop/ecommerce_dw_project/scripts/hive_create_table.hql
# 3. Verify the tables were created (inside the Hive CLI)
hive
USE ecommerce_dw;
SHOW TABLES;  -- expected output: ods_ecommerce_transactions, dwd_ecommerce_transactions
DESC dwd_ecommerce_transactions;  -- inspect the DWD table schema
SELECT * FROM dwd_ecommerce_transactions LIMIT 10;  -- sample the data (success if rows display)
quit;  -- exit the Hive CLI
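If MSCK REPAIR worked, every day of cleaned data should appear as its own partition. A quick per-partition sanity check from the same Hive session (a minimal sketch):
-- Row counts per daily partition; gaps or zeros point at write problems
SELECT dt, COUNT(*) AS records_per_day
FROM dwd_ecommerce_transactions
GROUP BY dt
ORDER BY dt;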
3. Business Analysis
Create the analysis scripts in the scripts directory.
① Basic Verification Script
Purpose: verify the completeness and consistency of the ODS and DWD layers and confirm the data pipeline is intact.
-- Basic Verification for ODS & DWD Layers
USE ecommerce_dw;
-- ==============================================
-- ODS Layer Verification
-- ==============================================
SELECT 'ODS Layer Structure' AS verification_step;
DESCRIBE ods_ecommerce_transactions;
SELECT 'ODS Layer Core Metrics' AS verification_step;
SELECT
COUNT(*) AS total_raw_records,
COUNT(DISTINCT Transaction_ID) AS unique_transaction_ids,
COUNT(DISTINCT User_Name) AS unique_users,
COUNT(DISTINCT Country) AS unique_countries,
COUNT(DISTINCT Product_Category) AS unique_categories,
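-- Note: Transaction_Date is still a raw STRING here (yyyy/M/d), so the
-- MIN/MAX below compare lexicographically, not chronologically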
MIN(Transaction_Date) AS earliest_transaction_date,
MAX(Transaction_Date) AS latest_transaction_date
FROM ods_ecommerce_transactions;
-- ==============================================
-- DWD Layer Verification
-- ==============================================
SELECT 'DWD Layer Partitions' AS verification_step;
SHOW PARTITIONS dwd_ecommerce_transactions;
SELECT 'DWD Layer Core Metrics' AS verification_step;
SELECT
COUNT(*) AS total_cleaned_records,
COUNT(DISTINCT Transaction_ID) AS unique_transaction_ids,
COUNT(DISTINCT User_Name) AS unique_users,
MIN(Age) AS min_age,
MAX(Age) AS max_age,
MIN(Purchase_Amount) AS min_purchase_amount,
MAX(Purchase_Amount) AS max_purchase_amount,
COUNT(DISTINCT dt) AS transaction_days
FROM dwd_ecommerce_transactions;
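-- (Added) Null check on key DWD fields; a minimal sketch assuming the Spark
-- cleaning already dropped such rows, so every count should come back 0
SELECT 'DWD Null Check' AS verification_step;
SELECT
SUM(CASE WHEN Transaction_ID IS NULL THEN 1 ELSE 0 END) AS null_transaction_ids,
SUM(CASE WHEN Purchase_Amount IS NULL THEN 1 ELSE 0 END) AS null_purchase_amounts,
SUM(CASE WHEN Transaction_Date IS NULL THEN 1 ELSE 0 END) AS null_transaction_dates
FROM dwd_ecommerce_transactions;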
-- ==============================================
-- Data Consistency (ODS vs DWD)
-- ==============================================
SELECT 'Category Distribution Comparison' AS verification_step;
SELECT
'ODS' AS data_layer,
Product_Category,
COUNT(*) AS transaction_count
FROM ods_ecommerce_transactions
GROUP BY Product_Category
UNION ALL
SELECT
'DWD' AS data_layer,
Product_Category,
COUNT(*) AS transaction_count
FROM dwd_ecommerce_transactions
GROUP BY Product_Category
ORDER BY Product_Category, data_layer;
SELECT 'Payment Method Distribution Comparison' AS verification_step;
SELECT
'ODS' AS data_layer,
Payment_Method,
COUNT(*) AS transaction_count
FROM ods_ecommerce_transactions
GROUP BY Payment_Method
UNION ALL
SELECT
'DWD' AS data_layer,
Payment_Method,
COUNT(*) AS transaction_count
FROM dwd_ecommerce_transactions
GROUP BY Payment_Method
ORDER BY Payment_Method, data_layer;
-- ==============================================
-- Verification Completed
-- ==============================================
SELECT 'Basic Verification Finished' AS result;
Run it:
# Run with beeline (recommended; works against a remote Hive cluster)
beeline -u jdbc:hive2://node1:10000 -n Hadoop -f basic_verification.sql > basic_verification_result.txt
# Or run it directly with the Hive CLI
hive -f basic_verification.sql > basic_verification_result.txt
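The row-count comparison above can also be collapsed into a single drop-rate query; a minimal sketch, assuming both layers are loaded as described:
-- How many records did cleaning remove? (ODS minus DWD)
SELECT
ods.cnt AS ods_records,
dwd.cnt AS dwd_records,
ods.cnt - dwd.cnt AS records_dropped_by_cleaning
FROM (SELECT COUNT(*) AS cnt FROM ods_ecommerce_transactions) ods
CROSS JOIN (SELECT COUNT(*) AS cnt FROM dwd_ecommerce_transactions) dwd;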
② Business Analysis Script
Purpose: analyze transaction scale, user value, product performance, and regional & payment preferences.
-- Core Business Analysis
USE ecommerce_dw;
-- ==============================================
-- 1. Transaction Scale Analysis (Time Dimension)
-- ==============================================
SELECT '1. Monthly Transaction Scale' AS analysis_topic;
CREATE TABLE IF NOT EXISTS monthly_transaction_scale (
transaction_year INT,
transaction_month INT,
transaction_count INT,
total_sales DECIMAL(12,2),
average_order_value DECIMAL(10,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/monthly_transaction_scale';
INSERT OVERWRITE TABLE monthly_transaction_scale
SELECT
YEAR(dt) AS transaction_year,
MONTH(dt) AS transaction_month,
COUNT(*) AS transaction_count,
SUM(Purchase_Amount) AS total_sales,
AVG(Purchase_Amount) AS average_order_value
FROM dwd_ecommerce_transactions
GROUP BY YEAR(dt), MONTH(dt)
ORDER BY transaction_year, transaction_month;
SELECT * FROM monthly_transaction_scale;
SELECT '1. Quarterly Transaction Scale' AS analysis_topic;
CREATE TABLE IF NOT EXISTS quarterly_transaction_scale (
transaction_year INT,
transaction_quarter INT,
transaction_count INT,
total_sales DECIMAL(12,2),
average_order_value DECIMAL(10,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/quarterly_transaction_scale';
INSERT OVERWRITE TABLE quarterly_transaction_scale
SELECT
YEAR(dt) AS transaction_year,
QUARTER(dt) AS transaction_quarter,
COUNT(*) AS transaction_count,
SUM(Purchase_Amount) AS total_sales,
AVG(Purchase_Amount) AS average_order_value
FROM dwd_ecommerce_transactions
GROUP BY YEAR(dt), QUARTER(dt)
ORDER BY transaction_year, transaction_quarter;
SELECT * FROM quarterly_transaction_scale;
-- ==============================================
-- 2. User Analysis
-- ==============================================
SELECT '2. User Age Group Analysis' AS analysis_topic;
CREATE TABLE IF NOT EXISTS user_age_group_analysis (
age_group STRING,
user_count INT,
transaction_count INT,
total_sales DECIMAL(12,2),
average_order_value DECIMAL(10,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/user_age_group_analysis';
INSERT OVERWRITE TABLE user_age_group_analysis
SELECT
CASE
WHEN Age BETWEEN 18 AND 25 THEN '18-25'
WHEN Age BETWEEN 26 AND 35 THEN '26-35'
WHEN Age BETWEEN 36 AND 45 THEN '36-45'
WHEN Age BETWEEN 46 AND 55 THEN '46-55'
ELSE '56+'
END AS age_group,
COUNT(DISTINCT User_Name) AS user_count,
COUNT(*) AS transaction_count,
SUM(Purchase_Amount) AS total_sales,
AVG(Purchase_Amount) AS average_order_value
FROM dwd_ecommerce_transactions
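-- Hive (unlike MySQL) does not accept a SELECT alias in GROUP BY,
-- hence the repeated CASE expression below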
GROUP BY
CASE
WHEN Age BETWEEN 18 AND 25 THEN '18-25'
WHEN Age BETWEEN 26 AND 35 THEN '26-35'
WHEN Age BETWEEN 36 AND 45 THEN '36-45'
WHEN Age BETWEEN 46 AND 55 THEN '46-55'
ELSE '56+'
END
ORDER BY total_sales DESC;
SELECT * FROM user_age_group_analysis;
SELECT '2. User Value Segmentation' AS analysis_topic;
CREATE TABLE IF NOT EXISTS user_value_segmentation (
user_value_segment STRING,
user_count INT,
total_transactions INT,
total_spend_amount DECIMAL(12,2),
avg_user_spend DECIMAL(10,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/user_value_segmentation';
INSERT OVERWRITE TABLE user_value_segmentation
WITH user_transaction_count AS (
SELECT
User_Name,
COUNT(*) AS transaction_count,
SUM(Purchase_Amount) AS total_spend
FROM dwd_ecommerce_transactions
GROUP BY User_Name
)
SELECT
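-- (Note) The segment thresholds below are illustrative; tune them to the
-- actual per-user transaction distribution in the dataset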
CASE
WHEN transaction_count >= 1000 THEN 'High_Value_1000+'
WHEN transaction_count >= 500 THEN 'Mid_High_Value_500_999'
WHEN transaction_count >= 100 THEN 'Mid_Value_100_499'
ELSE 'Regular_Value_Below_100'
END AS user_value_segment,
COUNT(User_Name) AS user_count,
SUM(transaction_count) AS total_transactions,
SUM(total_spend) AS total_spend_amount,
AVG(total_spend) AS avg_user_spend
FROM user_transaction_count
GROUP BY
CASE
WHEN transaction_count >= 1000 THEN 'High_Value_1000+'
WHEN transaction_count >= 500 THEN 'Mid_High_Value_500_999'
WHEN transaction_count >= 100 THEN 'Mid_Value_100_499'
ELSE 'Regular_Value_Below_100'
END
ORDER BY total_spend_amount DESC;
SELECT * FROM user_value_segmentation;
-- ==============================================
-- 3. Product Analysis
-- ==============================================
SELECT '3. Product Category Performance' AS analysis_topic;
CREATE TABLE IF NOT EXISTS product_category_performance (
product_category STRING,
sales_count INT,
total_sales DECIMAL(12,2),
category_aov DECIMAL(10,2),
sales_contribution_pct DECIMAL(5,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/product_category_performance';
INSERT OVERWRITE TABLE product_category_performance
SELECT
Product_Category,
COUNT(*) AS sales_count,
SUM(Purchase_Amount) AS total_sales,
AVG(Purchase_Amount) AS category_aov,
-- Hive does not support scalar subqueries in the SELECT list, so the grand
-- total is obtained with a window over the grouped sums instead
(SUM(Purchase_Amount) / SUM(SUM(Purchase_Amount)) OVER ()) * 100 AS sales_contribution_pct
FROM dwd_ecommerce_transactions
GROUP BY Product_Category
ORDER BY total_sales DESC;
SELECT * FROM product_category_performance;
SELECT '3. Quarterly Product Category Trend' AS analysis_topic;
CREATE TABLE IF NOT EXISTS quarterly_product_trend (
product_category STRING,
transaction_year INT,
transaction_quarter INT,
sales_count INT,
quarterly_sales DECIMAL(12,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/quarterly_product_trend';
INSERT OVERWRITE TABLE quarterly_product_trend
SELECT
Product_Category,
YEAR(dt) AS transaction_year,
QUARTER(dt) AS transaction_quarter,
COUNT(*) AS sales_count,
SUM(Purchase_Amount) AS quarterly_sales
FROM dwd_ecommerce_transactions
GROUP BY Product_Category, YEAR(dt), QUARTER(dt)
ORDER BY Product_Category, transaction_year, transaction_quarter;
SELECT * FROM quarterly_product_trend;
-- ==============================================
-- 4. Geography & Payment Method Analysis
-- ==============================================
SELECT '4. Country-wise Performance' AS analysis_topic;
CREATE TABLE IF NOT EXISTS country_performance (
country STRING,
transaction_count INT,
total_sales DECIMAL(12,2),
country_aov DECIMAL(10,2),
active_user_count INT
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/country_performance';
INSERT OVERWRITE TABLE country_performance
SELECT
Country,
COUNT(*) AS transaction_count,
SUM(Purchase_Amount) AS total_sales,
AVG(Purchase_Amount) AS country_aov,
COUNT(DISTINCT User_Name) AS active_user_count
FROM dwd_ecommerce_transactions
GROUP BY Country
ORDER BY total_sales DESC;
SELECT * FROM country_performance;
SELECT '4. Payment Method Preference by Country' AS analysis_topic;
CREATE TABLE IF NOT EXISTS payment_preference_by_country (
country STRING,
payment_method STRING,
payment_count INT,
country_payment_share_pct DECIMAL(5,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/payment_preference_by_country';
INSERT OVERWRITE TABLE payment_preference_by_country
SELECT
Country,
Payment_Method,
COUNT(*) AS payment_count,
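-- Share of each payment method within its country: the per-group count
-- divided by the windowed total of all counts for that country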
(COUNT(*) / SUM(COUNT(*)) OVER (PARTITION BY Country)) * 100 AS country_payment_share_pct
FROM dwd_ecommerce_transactions
GROUP BY Country, Payment_Method
ORDER BY Country, payment_count DESC;
SELECT * FROM payment_preference_by_country;
-- ==============================================
-- Analysis Completed
-- ==============================================
SELECT 'All Business Analysis Completed' AS result;
Run it:
# Run the analysis and save the results to a file
beeline -u jdbc:hive2://node1:10000 -n Hadoop -f business_analysis.sql > business_analysis_result.txt
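To spot-check one of the result tables after the run, a query like this works from the same beeline session:
-- Top categories by sales, with their contribution share
SELECT product_category, total_sales, sales_contribution_pct
FROM product_category_performance
ORDER BY total_sales DESC
LIMIT 5;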
4. Commit to the Git Repository (adapt to your own files)
# 1. Go to the project root
cd /home/Hadoop/ecommerce_dw_project/
# 2. Stage the new files
git add scripts/spark_preprocess.py
git add scripts/hive_create_table.hql
git add scripts/basic_verification.sql
git add scripts/business_analysis.sql
# 3. Commit to the local repository (write a clear message for easy traceability)
git commit -m "Complete full pipeline: dataset upload + Spark preprocessing + Hive modeling + business analysis"
# 4. Check the commit log (verify the commit succeeded)
git log --oneline -n 5
# Expected output: the newest entry is the commit message above
Case Summary
This was a very shallow pass through the workflow.
On reflection, a run-through like this is rough at best; there are surely plenty of techniques and tools that could speed up this pipeline or implement it better. Every error hit along the way could have been avoided with the right approach, but without experience I ended up hunting them down one at a time, very inefficiently.
With an immature knowledge base and no one to guide me, I was flying blind, and the result can look like little more than self-congratulation. At the beginner stage, directional guidance really is necessary; otherwise efficiency stays low and the payoff stays small.
The original intent of this case study was simply to walk the workflow end to end, whereas a real project exists to solve a problem; it is enough to be aware of that difference.
Future case studies can build on this one.