-- Longitude/latitude information
drop table if exists tb_test_kk_tmp;
create table tb_test_kk_tmp as
select kkbh,jd,wd from tb_test where
udf_is_null(kkbh)=1 and udf_is_null(jd)=1 and udf_is_null(wd)=1 and
jd not like '0%' and wd not like '0%' and
ld>=2018122500 and ld<2018122600
union all
select t.* from tb_test_kk t;
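-- The statement above is the incremental-dictionary idiom: take today's
-- rows (the ld window) and union them with the existing dictionary table;
-- duplicates are collapsed later by the GROUP BY, which with no aggregates
-- is equivalent to, e.g.:
--   select distinct kkbh,jd,wd from tb_test_kk_tmp;
-- (udf_is_null is a custom UDF; the =1 filters are assumed to keep rows
-- with usable values.)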
-- Name information
drop table if exists tb_test_kkmc_tmp;
create table tb_test_kkmc_tmp as
select kkbh,kkmc from tb_test where
udf_is_null(kkbh)=1 and udf_is_null(kkmc)=1 and
ld>=2018122500 and ld<2018122600
union all
select t.* from tb_test_kkmc t;
-- Set the number of reduce tasks to avoid producing lots of small files
set mapred.reduce.tasks=1;
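-- Note: mapred.reduce.tasks is the legacy MRv1 property name; on newer
-- Hive/Hadoop the equivalent would be:
-- set mapreduce.job.reduces=1;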
drop table if exists tb_test_kk;
create table tb_test_kk(
kkbh string,
jd string,
wd string
)row format delimited
fields terminated by ' '
lines terminated by '\n';
insert into tb_test_kk
select kkbh,jd,wd from tb_test_kk_tmp group by kkbh,jd,wd;
drop table if exists tb_test_kkmc;
create table tb_test_kkmc(
kkbh string,
kkmc string
)row format delimited
fields terminated by ' '
lines terminated by '\n';
insert into tb_test_kkmc
select kkbh,kkmc from tb_test_kkmc_tmp group by kkbh,kkmc;
#!/bin/bash
# Data cleaning
echo "Data cleaning----START"
# Clear the temporary download directory
localPath='/tmp/tb_test'
rm -rf "${localPath}"*
mkdir -p "$localPath"
# Tables whose data will be downloaded
tableName1=tb_test_kk
tableName2=tb_test_kkmc
beginDate=0
endDate=0
batchId=0
function getBatchId() {
    if [ -z "$1" ]; then
        batchId=$(date "+%Y%m%d00")
    else
        batchId=$(date -d "$1 days" "+%Y%m%d00")
    fi
}
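# Usage (assumes GNU date, which the -d option above requires):
#   getBatchId       # batchId = <today>00,     e.g. 2018122600
#   getBatchId -1    # batchId = <yesterday>00, e.g. 2018122500
# Note: the result comes back through the global $batchId, not stdout.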
# Substitute the date placeholders in the SQL template
getBatchId -1
beginDate=$batchId
sed -i "s/ld>=[0-9]\{10\}/ld>=$beginDate/g" kkdw_used.sql
getBatchId
endDate=$batchId
sed -i "s/ld<[0-9]\{10\}/ld<$endDate/g" kkdw_used.sql
echo "即将运算的数据区间是 $beginDate-$endDate"
echo "算法模板是: "
cat kkdw_used.sql
# 执行模板
/usr/bin/beeline -f /opt/etl/kkdw_etl/kkdw_used.sql
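# Note: this assumes the beeline connection (JDBC URL, user, password) is
# preconfigured; otherwise it must be given explicitly, e.g.:
#   /usr/bin/beeline -u "jdbc:hive2://<hiveserver2-host>:10000" -f /opt/etl/kkdw_etl/kkdw_used.sql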
# Get today's batch ID
getBatchId
# Resolve the table's HDFS path and download its data
hdfsPath=$(/usr/bin/beeline -e "desc formatted $tableName1" | awk -F' ' '{print $4}' | grep hdfs://)
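# ("desc formatted" prints a "Location:" row whose 4th whitespace-separated
# field is the table's warehouse path, hence the awk | grep pipeline above.)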
echo "表的HDFS路径是: $hdfsPath"
/usr/bin/hadoop fs -get $hdfsPath /tmp/
cd "/tmp/$tableName1/"
mv $(ls -1) $localPath/kk-basic-dict-$batchId.conf
echo "数据下载成功,存放目录:$localPath/kk-basic-dict-$batchId.conf"
hdfsPath=$(/usr/bin/beeline -e "desc formatted $tableName2" | awk -F' ' '{print $4}' | grep hdfs://)
echo "表的HDFS路径是: $hdfsPath"
/usr/bin/hadoop fs -get $hdfsPath /tmp/
cd "/tmp/$tableName2/"
mv $(ls -1) $localPath/kkmc-basic-dict-$batchId.conf
echo "数据下载成功,存放目录:$localPath/kkmc-basic-dict-$batchId.conf"
remotePath='/usr/test/'
# Copy the files to the remote host
/usr/bin/expect <<EOF
set timeout 10
spawn scp -r $localPath root@12.12.12.17:$remotePath
expect {
"(yes/no)?" {
send "yes\r";
exp_continue
}
"password:" {
send "timebusker\r";
exp_continue
}
}
EOF
echo "完成远程文件传输:scp $localPath root@12.12.12.17:$remotePtah"
# Edit the files on the remote host
/usr/bin/expect <<EOF
set timeout 10
spawn ssh root@12.12.12.17
expect {
"(yes/no)?" {
send "yes\r";
exp_continue
}
"password:" {
send "timebusker\r";
exp_continue
}
}
expect "files"
# Check the transferred files arrived, then move them into place
send "cd $remotePath\r"
send "mv tb_test/* ./\r"
send "rm -rf tb_test\r"
send "mv kk-basic-dict-$batchId.conf kk-basic-dict.conf\r"
expect "是否覆盖"
send "y\r"
send "mv kkmc-basic-dict-$batchId.conf kkmc-basic-dict.conf\r"
expect "是否覆盖"
send "y\r"
expect eof
EOF
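# Note: the "是否覆盖" ("overwrite?") prompts most likely come from the
# common root alias mv='mv -i'; using "mv -f" in the sends above would
# avoid the two expect/send exchanges.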
echo "完成远程文件编辑.............................."