-- Longitude/latitude information
drop table if exists tb_test_kk_tmp;
create table tb_test_kk_tmp as
select kkbh,jd,wd from tb_test where
udf_is_null(kkbh)=1 and udf_is_null(jd)=1 and udf_is_null(wd)=1 and
jd not like '0%' and wd not like '0%' and
ld>=2018122500 and ld<2018122600
union all
select t.* from tb_test_kk t;
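-- The statement above is the incremental-dictionary idiom: take today's
-- rows (the ld window) and union them with the existing dictionary table;
-- duplicates are collapsed later by the GROUP BY, which with no aggregates
-- is equivalent to, e.g.:
--   select distinct kkbh,jd,wd from tb_test_kk_tmp;
-- (udf_is_null is a custom UDF; the =1 filters are assumed to keep rows
-- with usable values.)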
-- Name information
drop table if exists tb_test_kkmc_tmp;
create table tb_test_kkmc_tmp as
select kkbh,kkmc from tb_test where
udf_is_null(kkbh)=1 and udf_is_null(kkmc)=1 and
ld>=2018122500 and ld<2018122600
union all
select t.* from tb_test_kkmc t;
-- Set the number of reduce tasks to avoid producing lots of small files
set mapred.reduce.tasks=1;
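-- Note: mapred.reduce.tasks is the legacy MRv1 property name; on newer
-- Hive/Hadoop the equivalent would be:
-- set mapreduce.job.reduces=1;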
drop table if exists tb_test_kk;
create table tb_test_kk(
kkbh string,
jd string,
wd string
)row format delimited
fields terminated by ' '
lines terminated by '\n';
insert into tb_test_kk
select kkbh,jd,wd from tb_test_kk_tmp group by kkbh,jd,wd;
drop table if exists tb_test_kkmc;
create table tb_test_kkmc(
kkbh string,
kkmc string
)row format delimited
fields terminated by ' '
lines terminated by '\n';
insert into tb_test_kkmc
select kkbh,kkmc from tb_test_kkmc_tmp group by kkbh,kkmc;
#!/bin/bash
# Data cleaning
echo "Data cleaning----START"
# Clear the temporary download directory
localPath='/tmp/tb_test'
rm -rf "${localPath}"*
mkdir -p "$localPath"
# Tables whose data will be downloaded
tableName1=tb_test_kk
tableName2=tb_test_kkmc
beginDate=0
endDate=0
batchId=0
function getBatchId() {
    if [ -z "$1" ]; then
        batchId=$(date "+%Y%m%d00")
    else
        batchId=$(date -d "$1 days" "+%Y%m%d00")
    fi
}
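# Usage (assumes GNU date, which the -d option above requires):
#   getBatchId       # batchId = <today>00,     e.g. 2018122600
#   getBatchId -1    # batchId = <yesterday>00, e.g. 2018122500
# Note: the result comes back through the global $batchId, not stdout.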
# Substitute the date placeholders in the SQL template
getBatchId -1
beginDate=$batchId
sed -i "s/ld>=[0-9]\{10\}/ld>=$beginDate/g" kkdw_used.sql
getBatchId
endDate=$batchId
sed -i "s/ld<[0-9]\{10\}/ld<$endDate/g" kkdw_used.sql
echo "即将运算的数据区间是 $beginDate-$endDate"
echo "算法模板是: "
cat kkdw_used.sql
# 执行模板
/usr/bin/beeline -f /opt/etl/kkdw_etl/kkdw_used.sql
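# Note: this assumes the beeline connection (JDBC URL, user, password) is
# preconfigured; otherwise it must be given explicitly, e.g.:
#   /usr/bin/beeline -u "jdbc:hive2://<hiveserver2-host>:10000" -f /opt/etl/kkdw_etl/kkdw_used.sql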
# Get today's batch ID
getBatchId
# Resolve the table's HDFS path and download its data
hdfsPath=$(/usr/bin/beeline -e "desc formatted $tableName1" | awk -F' ' '{print $4}' | grep hdfs://)
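# ("desc formatted" prints a "Location:" row whose 4th whitespace-separated
# field is the table's warehouse path, hence the awk | grep pipeline above.)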
echo "表的HDFS路径是: $hdfsPath"
/usr/bin/hadoop fs -get $hdfsPath /tmp/
cd "/tmp/$tableName1/"
mv $(ls -1) $localPath/kk-basic-dict-$batchId.conf
echo "数据下载成功,存放目录:$localPath/kk-basic-dict-$batchId.conf"
hdfsPath=$(/usr/bin/beeline -e "desc formatted $tableName2" | awk -F' ' '{print $4}' | grep hdfs://)
echo "表的HDFS路径是: $hdfsPath"
/usr/bin/hadoop fs -get $hdfsPath /tmp/
cd "/tmp/$tableName2/"
mv $(ls -1) $localPath/kkmc-basic-dict-$batchId.conf
echo "数据下载成功,存放目录:$localPath/kkmc-basic-dict-$batchId.conf"
remotePath='/usr/test/'
# Copy the files to the remote host
/usr/bin/expect <<EOF
set timeout 10
spawn scp -r $localPath root@12.12.12.17:$remotePath
expect {
"(yes/no)?" {
send "yes\r";
exp_continue
}
"password:" {
send "timebusker\r";
exp_continue
}
}
EOF
echo "完成远程文件传输:scp $localPath root@12.12.12.17:$remotePtah"
# Edit the files on the remote host
/usr/bin/expect <<EOF
set timeout 10
spawn ssh root@12.12.12.17
expect {
"(yes/no)?" {
send "yes\r";
exp_continue
}
"password:" {
send "timebusker\r";
exp_continue
}
}
expect "files"
# Check the transferred files arrived, then move them into place
send "cd $remotePath\r"
send "mv tb_test/* ./\r"
send "rm -rf tb_test\r"
send "mv kk-basic-dict-$batchId.conf kk-basic-dict.conf\r"
expect "是否覆盖"
send "y\r"
send "mv kkmc-basic-dict-$batchId.conf kkmc-basic-dict.conf\r"
expect "是否覆盖"
send "y\r"
expect eof
EOF
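# Note: the "是否覆盖" ("overwrite?") prompts most likely come from the
# common root alias mv='mv -i'; using "mv -f" in the sends above would
# avoid the two expect/send exchanges.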
echo "完成远程文件编辑.............................."