Hive中distinct和Group by效率对比及处理方式

时间:2022-01-12 04:30:12

结论:

针对大量数据的去重,group by的效率要远高于distinct。

从distinct转化为group by

简单的转化这里不提,一条语句里求总记录条数以及去重之后的记录条数
简单示例:
Hive中distinct和Group by效率对比及处理方式

SELECT
COUNT(*) AS sum_cnt
,COUNT(DISTINCT age ) AS user_cnt
FROM
liu_t_02 t
GROUP BY
t.name;

结果
Hive中distinct和Group by效率对比及处理方式

转化为group by 只需要在加一个group by,并采用sum和count的组合

SELECT
SUM(age),
COUNT(tt.age)
FROM
(
SELECT t.name0 AS name0,COUNT(age) as age
FROM
liu_t_02 t
GROUP BY
t.name0,t.age
)tt
GROUP BY tt.name0

测试时间对比

自己写的用于计算巨量数据的distinct的脚本,在shell中用时13分钟

select
rst.flag AS flag
,rst.source AS source
,rst.template AS template
,count(*) AS click_cnt
,count(distinct rst.imei) AS click_user
,rst.prod_name AS prod_name
from
(
SELECT
ttt.flag AS flag
,ttt.source AS source
,ttt.template AS template
,tttt.prod_name AS prod_name
,tttt.imei AS imei
FROM
(
SELECT
t2.flag AS flag
,t2.source AS source
,t2.template AS template
,imei
FROM
(
SELECT
t1.pt_d
,app_ver
,t1.cardId
,t1.template
,t1.source
,1 AS flag
,imei AS imei
FROM
(
SELECT
t.imei
,t.app_ver AS app_ver
,get_json_object(t.content,'$.cardId') AS cardId
,get_json_object(t.content,'$.template') AS template
,IF(t.content rlike 'content',get_json_object(t.content,'$.content'),'') AS source
,t.pt_d AS pt_d
FROM
(
SELECT
imei
,report_evt_content AS content
,app_ver
,pt_d
FROM bicoredata.dwd_evt_bdreporter_app_oper_info_report_dm
WHERE pt_d='20170722'
AND pt_service='hiboard'
AND package_name='com.huawei.hiboard'
AND app_ver rlike '5*'
AND report_evt_id='65539'
)t
)t1
WHERE t1.app_ver>='5.0.1.312'
AND cardid='4'
)t2

UNION ALL

SELECT
tt3.flag AS flag
,tt3.recreason AS source
,tt3.template AS template
,imei
FROM
(
SELECT
tt1.flag AS flag
,tt1.recreason AS recreason
,tt1.template AS template
,tt1.pt_d AS pt_d
,tt1.imei AS imei
FROM
(
SELECT
tt.re_time
,tt.times
,tt.imei
,tt.version
,tt.phonetype
,get_json_object(a.apps,'$.template') AS template
,get_json_object(a.apps,'$.cardId') AS cardId
,IF(a.apps rlike 'recReason',get_json_object(a.apps,'$.recReason'),'') AS recreason
,2 AS flag
,tt.pt_d AS pt_d
FROM
(
SELECT
split(message,'\\|')[0] AS re_time
,split(message,'\\|')[1] AS times
,bicoredata.HiboardAesDecrypt(split(message,'\\|')[2]) AS imei
,split(message,'\\|')[3] AS cardId
,split(message,'\\|')[4] AS version
,split(message,'\\|')[5] AS phonetype
,split(message,'\\|')[6] AS showlist
,pt_d
FROM
(
SELECT
get_json_object(meassage,'$.message') AS message
,get_json_object(meassage,'$.\\\\@logType') AS logType
,get_json_object(meassage,'$.\\@hostAddr') AS hostAddr
,get_json_object(meassage,'$.\\@hostName') AS hostName
,pt_d
FROM biads.ads_rcm_hiboard_server_shows_operlog_dm
WHERE pt_d='20170722'
)tt0
)tt
LATERAL VIEW EXPLODE(split(regexp_replace(SUBSTR(tt.showlist,2,LENGTH(tt.showlist)-2),'\\},\\{','\\}#\\{'),'#')) a AS apps
)tt1

JOIN

(
SELECT
imei
FROM bicoredata.dwd_evt_bdreporter_app_oper_info_report_dm
WHERE pt_d='20170722'
AND pt_service='hiboard'
AND package_name='com.huawei.hiboard'
AND app_ver rlike '5*'
AND report_evt_id='65537'
AND rec_time rlike '2017-07-22'
)tt2
ON tt1.imei = tt2.imei
WHERE tt1.cardid='4'
AND tt1.version>='5.0.1.312'
)tt3
)ttt

LEFT OUTER JOIN

(
SELECT
UPPER(prod_name) AS prod_name
,imei
FROM dwd_eqp_device_ds_his
)tttt
ON ttt.imei = tttt.imei

)rst
group by rst.flag,
rst.source,
rst.template,
rst.prod_name limit 50;

修改为group by,运算结果相同,用时9分钟,效率提升30%。


select
res.flag AS flag
,res.source AS source
,res.template AS template
,SUM(res.click_user) AS click_cnt
,count(res.click_user) AS click_user
,res.prod_name AS prod_name
from
(
select
rst.flag AS flag
,rst.source AS source
,rst.template AS template
,count(rst.imei) AS click_user
,rst.prod_name AS prod_name
from
(
SELECT
ttt.flag AS flag
,ttt.source AS source
,ttt.template AS template
,tttt.prod_name AS prod_name
,tttt.imei AS imei
FROM
(
SELECT
t2.flag AS flag
,t2.source AS source
,t2.template AS template
,imei
FROM
(
SELECT
t1.pt_d
,app_ver
,t1.cardId
,t1.template
,t1.source
,1 AS flag
,imei AS imei
FROM
(
SELECT
t.imei
,t.app_ver AS app_ver
,get_json_object(t.content,'$.cardId') AS cardId
,get_json_object(t.content,'$.template') AS template
,IF(t.content rlike 'content',get_json_object(t.content,'$.content'),'') AS source
,t.pt_d AS pt_d
FROM
(
SELECT
imei
,report_evt_content AS content
,app_ver
,pt_d
FROM bicoredata.dwd_evt_bdreporter_app_oper_info_report_dm
WHERE pt_d='20170722'
AND pt_service='hiboard'
AND package_name='com.huawei.hiboard'
AND app_ver rlike '5*'
AND report_evt_id='65539'
)t
)t1
WHERE t1.app_ver>='5.0.1.312'
AND cardid='4'
)t2

UNION ALL

SELECT
tt3.flag AS flag
,tt3.recreason AS source
,tt3.template AS template
,imei
FROM
(
SELECT
tt1.flag AS flag
,tt1.recreason AS recreason
,tt1.template AS template
,tt1.pt_d AS pt_d
,tt1.imei AS imei
FROM
(
SELECT
tt.re_time
,tt.times
,tt.imei
,tt.version
,tt.phonetype
,get_json_object(a.apps,'$.template') AS template
,get_json_object(a.apps,'$.cardId') AS cardId
,IF(a.apps rlike 'recReason',get_json_object(a.apps,'$.recReason'),'') AS recreason
,2 AS flag
,tt.pt_d AS pt_d
FROM
(
SELECT
split(message,'\\|')[0] AS re_time
,split(message,'\\|')[1] AS times
,bicoredata.HiboardAesDecrypt(split(message,'\\|')[2]) AS imei
,split(message,'\\|')[3] AS cardId
,split(message,'\\|')[4] AS version
,split(message,'\\|')[5] AS phonetype
,split(message,'\\|')[6] AS showlist
,pt_d
FROM
(
SELECT
get_json_object(meassage,'$.message') AS message
,get_json_object(meassage,'$.\\\\@logType') AS logType
,get_json_object(meassage,'$.\\@hostAddr') AS hostAddr
,get_json_object(meassage,'$.\\@hostName') AS hostName
,pt_d
FROM biads.ads_rcm_hiboard_server_shows_operlog_dm
WHERE pt_d='20170722'
)tt0
)tt
LATERAL VIEW EXPLODE(split(regexp_replace(SUBSTR(tt.showlist,2,LENGTH(tt.showlist)-2),'\\},\\{','\\}#\\{'),'#')) a AS apps
)tt1

JOIN

(
SELECT
imei
FROM bicoredata.dwd_evt_bdreporter_app_oper_info_report_dm
WHERE pt_d='20170722'
AND pt_service='hiboard'
AND package_name='com.huawei.hiboard'
AND app_ver rlike '5*'
AND report_evt_id='65537'
AND rec_time rlike '2017-07-22'
)tt2
ON tt1.imei = tt2.imei
WHERE tt1.cardid='4'
AND tt1.version>='5.0.1.312'
)tt3
)ttt

LEFT OUTER JOIN

(
SELECT
UPPER(prod_name) AS prod_name
,imei
FROM dwd_eqp_device_ds_his
)tttt
ON ttt.imei = tttt.imei

)rst
group by rst.flag,
rst.source,
rst.template,
rst.prod_name,
rst.imei
)res
group by res.flag,
res.source,
res.template,
res.prod_name limit 50;