CDH5.4.5运行多字符分割记录

时间:2022-04-11 07:17:42

准备工作:

测试文件内容:cis_cust_imp_info

20131131|+|100010001001|+|BR01|+|2000.01
20131131|+|100010001002|+|BR01|+|2000.02
20131131|+|100010001003|+|BR02|+|2000.03

将测试文件放入HDFS:

sudo -u hdfs hdfs dfs -mkdir -p /ceb/cis_cust_imp_info
sudo -u hdfs hdfs dfs -chmod 777 /ceb/cis_cust_imp_info
hdfs dfs -put cis_cust_imp_info /ceb/cis_cust_imp_info

在Hive中输入下面的命令:

add jar /opt/cloudera/parcels/CDH/lib/hive/lib/hive-contrib-1.1.0-cdh5.4.5.jar;
add jar /root/ceb/HiveMultiDelimiter.jar;

DROP TABLE IF EXISTS cis_cust_imp_info_ext;
CREATE EXTERNAL TABLE IF NOT EXISTS cis_cust_imp_info_ext(statistics_dt string,cust_id string,open_org_id string,assert9_bal string)
ROW FORMAT SERDE 'com.cloudera.hadoop.hive.contrib.serde2.alter.MultiDelimitSerDe' WITH SERDEPROPERTIES ('input.delimited'='\|\+\|','field.encoding'='utf-8')
LOCATION '/ceb/cis_cust_imp_info';

select * from cis_cust_imp_info_ext;

--hive:将多字符分割变成impala/phoenix能处理的txt
create table cis_cust_imp_info(cust_id string,statistics_dt string,assert9_bal double)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
insert overwrite table cis_cust_imp_info select cust_id, statistics_dt, assert9_bal from cis_cust_imp_info_ext;