Spark性能调优案例-多表join优化,减少shuffle

时间:2025-04-03 20:58:22
// 从tableA读取一次数据,放到临时表t1 DROP TABLE IF EXISTS temp.tmp_xx_$date_1; CREATE TABLE IF NOT EXISTS temp.tmp_xxx_$date_1 as select xxx from tableA where xxxx; // 从临时表t1读取和转换数据,得临时表t2 DROP TABLE IF EXISTS temp.tmp_xx_$date_2; CREATE TABLE IF NOT EXISTS temp.tmp_xxx_$date_2 as SELECT device_id ,c2 FROM ( SELECT device_id ,c2 ,ROW_NUMBER() OVER (PARTITION BY device_id ORDER BY time DESC) AS num FROM ( select device_id ,c2 from temp.tmp_xx_$date_1 where !isempty(c2) )t )t WHERE t.num=1; // 从临时表t1读取和转换数据,得临时表t3 DROP TABLE IF EXISTS temp.tmp_xx_$date_3; CREATE TABLE IF NOT EXISTS temp.tmp_xxx_$date_3 as SELECT device_id ,c3 FROM ( SELECT device_id ,c3 ,ROW_NUMBER() OVER (PARTITION BY device_id ORDER BY time DESC) AS num FROM ( select device_id ,c3 from temp.tmp_xx_$date_1 where !isempty(c3) )t )t WHERE t.num=1; // 从临时表t1读取和转换数据,得临时表t4 DROP TABLE IF EXISTS temp.tmp_xx_$date_4; CREATE TABLE IF NOT EXISTS temp.tmp_xxx_$date_4 as SELECT device_id ,c4 FROM ( SELECT device_id ,c6 ,ROW_NUMBER() OVER (PARTITION BY device_id ORDER BY time DESC) AS num FROM ( select device_id ,c4 from temp.tmp_xx_$date_1 where !isempty(c4) )t )t WHERE t.num=1 ... // 从临时表t1读取和转换数据,得临时表t7 DROP TABLE IF EXISTS temp.tmp_xx_$date_7; CREATE TABLE IF NOT EXISTS temp.tmp_xxx_$date_7 as SELECT device_id ,c7 FROM ( SELECT device_id ,c7 ,ROW_NUMBER() OVER (PARTITION BY device_id ORDER BY time DESC) AS num FROM ( select device_id ,c7 from temp.tmp_xx_$date_1 where !isempty(c7) )t )t WHERE t.num=1 //t2关联t3-t7结果写入最终结果表 INSERT OVERWRITE TABLE biads.xxxx PARTITION (pt_d='$date') select xxx from temp.tmp_xx_$date_2 left join temp.tmp_xx_$date_3 left join temp.tmp_xx_$date_4 ... left join temp.tmp_xx_$date_7