ClickHouse复杂查询单表亿级数据案例(可导出Excel)

时间:2024-09-30 17:03:59

通过本篇博客,读者可以了解到如何在 ClickHouse 中高效地创建和管理大规模销售数据。随机数据生成和复杂查询的示例展示了 ClickHouse 的强大性能和灵活性。掌握这些技能后,用户能够更好地进行数据分析和决策支持,提升业务洞察能力。

表结构准备

销售表

CREATE TABLE IF NOT EXISTS sales (
    id Int64,
    product_id Int64,
    quantity Int32,
    price Float64,
    timestamp DateTime,
    customer_id Int64,          -- 客户ID
    discount Float64 DEFAULT 0, -- 折扣
    total_amount Float64,       -- 总金额
    payment_method String,      -- 付款方式
    status String,              -- 订单状态
    shipping_address String,    -- 发货地址
    billing_address String,     -- 账单地址
    order_notes String,         -- 订单备注
    created_at DateTime,        -- 创建时间
    updated_at DateTime,        -- 更新时间
    shipping_cost Float64,      -- 运费
    tax Float64,                -- 税费
    order_source String,        -- 订单来源
    fulfillment_status String,   -- 履行状态
    product_name String,        -- 产品名称
    product_category String      -- 产品类别
) ENGINE = MergeTree()
ORDER BY timestamp;

数据准备

随机生成1亿数据(我用的2千万一次一次插入)

INSERT INTO sales (id, product_id, quantity, price, timestamp, customer_id, discount, total_amount, payment_method, status, shipping_address, billing_address, order_notes, created_at, updated_at, shipping_cost, tax, order_source, fulfillment_status, product_name, product_category) SELECT
    number AS id,
    rand() % 10000 AS product_id,
    round((rand() % 20) + 5) AS quantity,
    round((rand() % 1000) + 100, 2) AS price,
    now() - toIntervalDay(rand() % 3650) AS timestamp,
    rand() % 1000 AS customer_id,
    round(rand() % 50, 2) AS discount,
    round((quantity * price) * (1 - (discount / 100)), 2) AS total_amount,
    ['credit_card', 'paypal', 'bank_transfer', 'cash', 'gift_card'][(rand() % 5) + 1] AS payment_method,
    ['pending', 'completed', 'canceled', 'refunded'][(rand() % 4) + 1] AS status,
    concat('Shipping Address ', number) AS shipping_address,
    concat('Billing Address ', number) AS billing_address,
    concat('Order notes for order ', number) AS order_notes,
    now() - toIntervalDay(rand() % 3650) AS created_at,
    now() - toIntervalDay(rand() % 3650) AS updated_at,
    round((rand() % 100) + 10, 2) AS shipping_cost,
    round((rand() % 50) + 5, 2) AS tax,
    ['website', 'mobile_app', 'third_party'][(rand() % 3) + 1] AS order_source,
    ['not_fulfilled', 'fulfilled', 'partially_fulfilled'][(rand() % 3) + 1] AS fulfillment_status,
    concat('Product Name ', number) AS product_name,
    ['electronics', 'clothing', 'home', 'toys', 'books'][(rand() % 5) + 1] AS product_category
FROM numbers(20000000)

Query id: 1237aa3d-3596-4d76-ac1b-cd4854eaa7bd

↖ Progress: 17.40 million rows, 139.19 MB (1.44 million rows/s., 11.48 MB/s.) (1.0 CPU, 415.10 MB RAM)███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                       86%

在生成数据时,我们可以看到CPU的占用率已经大于单核

 PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ COMMAND                                                                                                                                                                              
 114260 clickho+  20   0   10.4g   1.7g 339104 S 140.9  11.4   3:36.99 clickhouse-serv

执行插入完毕后,查询当前数据条数

select count(*) from sales

SELECT count(*)
FROM sales

Query id: aeb5a6f3-9776-4220-8f1a-abf1e5855943

   ┌───count()─┐
1. │ 100101000 │ -- 100.10 million
   └───────────┘

1 row in set. Elapsed: 0.001 sec.

较为复杂的指标查询语句

SELECT
    product_id,  -- 产品 ID
    SUM(quantity) AS total_quantity,  -- 总销售数量
    SUM(quantity * price) AS total_sales,  -- 总销售额
    AVG(price) AS average_price,  -- 平均价格
    COUNT(*) AS total_transactions,  -- 总交易次数
    MAX(price) AS max_price,  -- 最高价格
    MIN(price) AS min_price,  -- 最低价格
    SUM(quantity) / NULLIF(COUNT(*), 0) AS avg_quantity_per_transaction_count,  -- 每笔交易的平均数量
    SUM(quantity * price) / NULLIF(SUM(quantity), 0) AS avg_sales_per_unit,  -- 每单位的平均销售额
    SUM(quantity) / NULLIF(COUNT(DISTINCT timestamp), 0) AS avg_daily_sales,  -- 平均每日销售数量
    COUNT(DISTINCT timestamp) AS selling_days,  -- 销售天数
    SUM(quantity * price) / NULLIF(COUNT(DISTINCT timestamp), 0) AS avg_daily_sales_value,  -- 平均每日销售额
    SUM(quantity) / SUM(quantity * price) AS sales_conversion_rate,  -- 销售转化率
    COUNT(IF(price > 50, 1, NULL)) AS high_price_transactions,  -- 高价交易次数(价格 > 50)
    COUNT(IF(price <= 50, 1, NULL)) AS low_price_transactions,  -- 低价交易次数(价格 <= 50)
    SUM(quantity * price) / NULLIF(SUM(quantity), 0) AS sales_price,  -- 销售价格
    (SUM(quantity) / SUM(quantity * price)) * 100 AS sales_contribution_rate,  -- 销售贡献率
    COUNT(IF(quantity > 10, 1, NULL)) AS bulk_sales_transactions,  -- 大宗销售交易次数(数量 > 10)
    SUM(IF(timestamp >= (NOW() - INTERVAL 30 DAY), quantity, 0)) AS recent_sales_quantity,  -- 最近30天的销售数量
    SUM(IF(timestamp >= (NOW() - INTERVAL 30 DAY), quantity * price, 0)) AS recent_sales_value,  -- 最近30天的销售额
    AVG(IF(price > 50, price, NULL)) AS avg_high_price,  -- 高价商品的平均价格
    AVG(IF(price <= 50, price, NULL)) AS avg_low_price,  -- 低价商品的平均价格
    SUM(quantity) / NULLIF(COUNT(DISTINCT timestamp), 0) AS avg_quantity_per_day,  -- 每天的平均销售数量
    COUNT(IF(timestamp >= (NOW() - INTERVAL 7 DAY), 1, NULL)) AS recent_transactions,  -- 最近7天的交易次数
    SUM(IF(timestamp >= (NOW() - INTERVAL 7 DAY), quantity, 0)) AS recent_week_sales_quantity,  -- 最近7天的销售数量
    SUM(IF(timestamp >= (NOW() - INTERVAL 7 DAY), quantity * price, 0)) AS recent_week_sales_value,  -- 最近7天的销售额
    SUM(IF(price < 20, quantity, 0)) AS low_price_sales_quantity,  -- 低价销售数量(价格 < 20)
    SUM(IF((price >= 20) AND (price <= 50), quantity, 0)) AS mid_price_sales_quantity,  -- 中价销售数量(20 <= 价格 <= 50)
    SUM(IF(price > 50, quantity, 0)) AS high_price_sales_quantity,  -- 高价销售数量(价格 > 50)
    COUNT(IF(quantity > 1, 1, NULL)) AS multiple_items_transactions,  -- 多件商品交易次数(数量 > 1)
    COUNT(IF(price IS NULL, 1, NULL)) AS missing_price_transactions,  -- 缺失价格的交易次数
    SUM(IF(price IS NOT NULL, quantity * price, 0)) AS sales_with_price,  -- 有价格的销售额
    SUM(IF(price > 100, quantity, 0)) AS high_value_sales_quantity,  -- 高价值销售数量(价格 > 100)
    SUM(IF((price >= 20) AND (price <= 100), quantity, 0)) AS mid_value_sales_quantity,  -- 中价值销售数量(20 <= 价格 <= 100)
    SUM(IF(price < 20, quantity, 0)) AS low_value_sales_quantity,  -- 低价值销售数量(价格 < 20)
    COUNT(IF(quantity > 5, 1, NULL)) AS frequent_buyers,  -- 频繁购买者(数量 > 5)
    SUM(IF(timestamp >= (NOW() - INTERVAL 1 YEAR), quantity, 0)) AS yearly_sales_quantity,  -- 年度销售数量
    SUM(IF(timestamp >= (NOW() - INTERVAL 1 YEAR), quantity * price, 0)) AS yearly_sales_value  -- 年度销售额
FROM sales
GROUP BY product_id  -- 按产品 ID 分组
ORDER BY total_sales DESC;  -- 按总销售额降序排列

查询结果占用资源情况:

10000 rows in set. Elapsed: 2.439 sec. Processed 100.10 million rows, 2.40 GB (41.05 million rows/s., 985.12 MB/s.)
Peak memory usage: 93.66 MiB.

这个查询结果的输出信息包含了几个关键部分,下面逐一解释:

  1. 10000 rows in set

  • 这表示查询结果中返回了 10,000 行数据。这个数字是查询所处理的结果集的行数。
  1. Elapsed: 2.439 sec

  • 这是查询执行的总时间,表示从开始到结束所花费的时间为 2.439 秒。
  1. Processed 100.10 million rows, 2.40 GB

  • 这表示在执行查询时,数据库系统处理了 1 亿 10 万行数据,总共读取了 2.40 GB 的数据。这意味着虽然最终只返回了 10,000 行,但在计算这些结果时,数据库需要扫描大量的数据。
  1. (41.05 million rows/s., 985.12 MB/s.)

  • 这部分提供了处理速度的信息:
    • 985.12 MB/s.:表示数据读取的速度为每秒 985.12 MB。

    • 41.05 million rows/s.:表示查询处理的速度为每秒 4105 万行。

  1. Peak memory usage: 93.66 MiB

  • 这是查询执行过程中使用的最大内存量,表示查询的峰值内存使用为 93.66 MiB。

导出到Excel表格并附加表头

SELECT
    product_id,  -- 产品 ID
    SUM(quantity) AS total_quantity,  -- 总销售数量
    SUM(quantity * price) AS total_sales,  -- 总销售额
    AVG(price) AS average_price,  -- 平均价格
    COUNT(*) AS total_transactions,  -- 总交易次数
    MAX(price) AS max_price,  -- 最高价格
    MIN(price) AS min_price,  -- 最低价格
    SUM(quantity) / NULLIF(COUNT(*), 0) AS avg_quantity_per_transaction_count,  -- 每笔交易的平均数量
    SUM(quantity * price) / NULLIF(SUM(quantity), 0) AS avg_sales_per_unit,  -- 每单位的平均销售额
    SUM(quantity) / NULLIF(COUNT(DISTINCT timestamp), 0) AS avg_daily_sales,  -- 平均每日销售数量
    COUNT(DISTINCT timestamp) AS selling_days,  -- 销售天数
    SUM(quantity * price) / NULLIF(COUNT(DISTINCT timestamp), 0) AS avg_daily_sales_value,  -- 平均每日销售额
    SUM(quantity) / SUM(quantity * price) AS sales_conversion_rate,  -- 销售转化率
    COUNT(IF(price > 50, 1, NULL)) AS high_price_transactions,  -- 高价交易次数(价格 > 50)
    COUNT(IF(price <= 50, 1, NULL)) AS low_price_transactions,  -- 低价交易次数(价格 <= 50)
    SUM(quantity * price) / NULLIF(SUM(quantity), 0) AS sales_price,  -- 销售价格
    (SUM(quantity) / SUM(quantity * price)) * 100 AS sales_contribution_rate,  -- 销售贡献率
    COUNT(IF(quantity > 10, 1, NULL)) AS bulk_sales_transactions,  -- 大宗销售交易次数(数量 > 10)
    SUM(IF(timestamp >= (NOW() - INTERVAL 30 DAY), quantity, 0)) AS recent_sales_quantity,  -- 最近30天的销售数量
    SUM(IF(timestamp >= (NOW() - INTERVAL 30 DAY), quantity * price, 0)) AS recent_sales_value,  -- 最近30天的销售额
    AVG(IF(price > 50, price, NULL)) AS avg_high_price,  -- 高价商品的平均价格
    AVG(IF(price <= 50, price, NULL)) AS avg_low_price,  -- 低价商品的平均价格
    SUM(quantity) / NULLIF(COUNT(DISTINCT timestamp), 0) AS avg_quantity_per_day,  -- 每天的平均销售数量
    COUNT(IF(timestamp >= (NOW() - INTERVAL 7 DAY), 1, NULL)) AS recent_transactions,  -- 最近7天的交易次数
    SUM(IF(timestamp >= (NOW() - INTERVAL 7 DAY), quantity, 0)) AS recent_week_sales_quantity,  -- 最近7天的销售数量
    SUM(IF(timestamp >= (NOW() - INTERVAL 7 DAY), quantity * price, 0)) AS recent_week_sales_value,  -- 最近7天的销售额
    SUM(IF(price < 20, quantity, 0)) AS low_price_sales_quantity,  -- 低价销售数量(价格 < 20)
    SUM(IF((price >= 20) AND (price <= 50), quantity, 0)) AS mid_price_sales_quantity,  -- 中价销售数量(20 <= 价格 <= 50)
    SUM(IF(price > 50, quantity, 0)) AS high_price_sales_quantity,  -- 高价销售数量(价格 > 50)
    COUNT(IF(quantity > 1, 1, NULL)) AS multiple_items_transactions,  -- 多件商品交易次数(数量 > 1)
    COUNT(IF(price IS NULL, 1, NULL)) AS missing_price_transactions,  -- 缺失价格的交易次数
    SUM(IF(price IS NOT NULL, quantity * price, 0)) AS sales_with_price,  -- 有价格的销售额
    SUM(IF(price > 100, quantity, 0)) AS high_value_sales_quantity,  -- 高价值销售数量(价格 > 100)
    SUM(IF((price >= 20) AND (price <= 100), quantity, 0)) AS mid_value_sales_quantity,  -- 中价值销售数量(20 <= 价格 <= 100)
    SUM(IF(price < 20, quantity, 0)) AS low_value_sales_quantity,  -- 低价值销售数量(价格 < 20)
    COUNT(IF(quantity > 5, 1, NULL)) AS frequent_buyers,  -- 频繁购买者(数量 > 5)
    SUM(IF(timestamp >= (NOW() - INTERVAL 1 YEAR), quantity, 0)) AS yearly_sales_quantity,  -- 年度销售数量
    SUM(IF(timestamp >= (NOW() - INTERVAL 1 YEAR), quantity * price, 0)) AS yearly_sales_value,  -- 年度销售额
    AVG(IF(price IS NOT NULL, price, NULL)) AS avg_price,  -- 平均价格(排除 NULL)
    COUNT(DISTINCT customer_id) AS unique_customers,  -- 唯一客户数量
    COUNT(IF(quantity = 0, 1, NULL)) AS zero_quantity_sales,  -- 销售数量为零的交易次数
    SUM(IF(price IS NOT NULL AND quantity > 0, quantity * price, 0)) AS valid_sales_value  -- 有效销售额(价格不为 NULL 且数量 > 0)
FROM sales
GROUP BY product_id  -- 按产品 ID 分组
ORDER BY total_sales DESC  -- 按总销售额降序排列
INTO OUTFILE '/test1.csv'  -- 输出到 CSV 文件
FORMAT CSVWithNames;  -- CSV 格式包含列名

然后下载到Windows系统打开即可