Spark性能调优案例-多表join优化,减少shuffle
// 从tableA读取一次数据,放到临时表t1
DROP TABLE IF EXISTS temp.tmp_xx_$date_1;
CREATE TABLE IF NOT EXISTS temp.tmp_xxx_$date_1
as
select
xxx
from tableA
where xxxx;
// 从临时表t1读取和转换数据,得临时表t2
DROP TABLE IF EXISTS temp.tmp_xx_$date_2;
CREATE TABLE IF NOT EXISTS temp.tmp_xxx_$date_2
as
SELECT
device_id
,c2
FROM
(
SELECT
device_id
,c2
,ROW_NUMBER() OVER (PARTITION BY device_id ORDER BY time DESC) AS num
FROM
(
select
device_id
,c2
from temp.tmp_xx_$date_1
where !isempty(c2)
)t
)t
WHERE t.num=1;
// 从临时表t1读取和转换数据,得临时表t3
DROP TABLE IF EXISTS temp.tmp_xx_$date_3;
CREATE TABLE IF NOT EXISTS temp.tmp_xxx_$date_3
as
SELECT
device_id
,c3
FROM
(
SELECT
device_id
,c3
,ROW_NUMBER() OVER (PARTITION BY device_id ORDER BY time DESC) AS num
FROM
(
select
device_id
,c3
from temp.tmp_xx_$date_1
where !isempty(c3)
)t
)t
WHERE t.num=1;
// 从临时表t1读取和转换数据,得临时表t4
DROP TABLE IF EXISTS temp.tmp_xx_$date_4;
CREATE TABLE IF NOT EXISTS temp.tmp_xxx_$date_4
as
SELECT
device_id
,c4
FROM
(
SELECT
device_id
,c6
,ROW_NUMBER() OVER (PARTITION BY device_id ORDER BY time DESC) AS num
FROM
(
select
device_id
,c4
from temp.tmp_xx_$date_1
where !isempty(c4)
)t
)t
WHERE t.num=1
...
// 从临时表t1读取和转换数据,得临时表t7
DROP TABLE IF EXISTS temp.tmp_xx_$date_7;
CREATE TABLE IF NOT EXISTS temp.tmp_xxx_$date_7
as
SELECT
device_id
,c7
FROM
(
SELECT
device_id
,c7
,ROW_NUMBER() OVER (PARTITION BY device_id ORDER BY time DESC) AS num
FROM
(
select
device_id
,c7
from temp.tmp_xx_$date_1
where !isempty(c7)
)t
)t
WHERE t.num=1
//t2关联t3-t7结果写入最终结果表
INSERT OVERWRITE TABLE biads.xxxx
PARTITION (pt_d='$date')
select
xxx
from temp.tmp_xx_$date_2
left join temp.tmp_xx_$date_3
left join temp.tmp_xx_$date_4
...
left join temp.tmp_xx_$date_7