博客 / 詳情

返回

數據模擬:生成 1 萬(10000)個分區,每個分區 2 條數據

#!/bin/bash
# Generate test data: 10,000 partitions with 2 rows each (20,000 rows total),
# then submit the job to YARN via spark-sql.
#
# Bug fix: the original script wrote the SQL to `hive2dlc_single_partition1.sql`
# but executed `hive_single_partition.sql` — the generated file was never run.
# A single variable now guarantees the generated and executed files match.

SQL_FILE="hive_single_partition.sql"

# Generate the SQL file.
cat > "${SQL_FILE}" << 'EOF'
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;
SET hive.exec.max.dynamic.partitions=10000;
SET hive.exec.max.dynamic.partitions.pernode=10000;
SET spark.sql.shuffle.partitions=50;

CREATE DATABASE IF NOT EXISTS journey;

CREATE TABLE IF NOT EXISTS journey.hive_single_partition (
    shop_name STRING,
    customer_id INT
)
PARTITIONED BY (dt STRING);

-- ids 0..19999 -> partition_id 0..9999; two rows (id % 2) per partition.
-- dt = 2000-01-01 + partition_id days, formatted yyyyMMdd.
INSERT OVERWRITE TABLE journey.hive_single_partition PARTITION (dt)
SELECT
  CONCAT('shop', CAST(partition_id AS STRING)) AS shop_name,
  CAST(partition_id * 100 + (id % 2) AS INT) AS customer_id,
  DATE_FORMAT(DATE_ADD('2000-01-01', CAST(partition_id AS INT)), 'yyyyMMdd') AS dt
FROM (
  SELECT
    id,
    CAST(FLOOR(id / 2) AS INT) AS partition_id
  FROM (
    SELECT explode(sequence(0, 19999)) AS id
  ) ids  -- derived-table alias: required by Hive, harmless in Spark
) generated_data;
EOF

# Run Spark SQL in the background; capture output so the nohup'd job is debuggable.
nohup spark-sql --master yarn -f "${SQL_FILE}" > "${SQL_FILE%.sql}.log" 2>&1 &


-- Compute partition-level statistics for the freshly loaded table.
-- Fix: qualify the table with its database — the rest of the workflow uses
-- `journey.hive_single_partition`, so an unqualified name fails (or analyzes
-- the wrong table) unless the session's current database is `journey`.
ANALYZE TABLE journey.hive_single_partition PARTITION(dt) COMPUTE STATISTICS;
user avatar edagarli 頭像 jellyfishmix 頭像
2 位用戶收藏了這個故事!

發佈 評論

Some HTML is okay.