Hive—高级操作

By timebusker on February 21, 2018

数据类型

  • array
# 测试数据
1	huangbo	guangzhou,xianggang,shenzhen	a1:30,a2:20,a3:100	beijing,112233,13522334455,500
2	xuzheng	xianggang	b2:50,b3:40	tianjin,223344,13644556677,600
3	wangbaoqiang	beijing,zhejinag	c1:200	chongqinjg,334455,15622334455,20


create table t_data_type(
   id int, 
   name string, 
   work_location array<string>, 
   piaofang map<string,bigint>, 
   address struct<location:string,zipcode:int,phone:string,value:int>
) row format delimited fields terminated by "\t" collection items terminated by "," map keys terminated by ":" lines terminated by "\n";


+-----+---------------+---------------------------------------+-----------------------------+------------------------------------------------------------------------------+--+
| id  |     name      |             work_location             |          piaofang           |                                   address                                    |
+-----+---------------+---------------------------------------+-----------------------------+------------------------------------------------------------------------------+--+
| 1   | huangbo       | ["guangzhou","xianggang","shenzhen"]  | {"a1":30,"a2":20,"a3":100}  | {"location":"beijing","zipcode":112233,"phone":"13522334455","value":500}    |
| 2   | xuzheng       | ["xianggang"]                         | {"b2":50,"b3":40}           | {"location":"tianjin","zipcode":223344,"phone":"13644556677","value":600}    |
| 3   | wangbaoqiang  | ["beijing","zhejinag"]                | {"c1":200}                  | {"location":"chongqinjg","zipcode":334455,"phone":"15622334455","value":20}  |
+-----+---------------+---------------------------------------+-----------------------------+------------------------------------------------------------------------------+--+


select work_location[0] from t_data_type;

+-------------------+--+
| work_location[0]  |
+-------------------+--+
| guangzhou         |
| xianggang         |
| beijing           |
+-------------------+--+
  • map
select piaofang,piaofang["a1"] from t_data_type;

+-----------------------------+---------------+--+
|          piaofang           | piaofang[a1]  |
+-----------------------------+---------------+--+
| {"a1":30,"a2":20,"a3":100}  | 30            |
| {"b2":50,"b3":40}           | NULL          |
| {"c1":200}                  | NULL          |
+-----------------------------+---------------+--+

特殊分隔符处理

hive 读取数据的机制:

  • 首先用 InputFormat<默认是:org.apache.hadoop.mapred.TextInputFormat >的一个具体实 现类读入文件数据,返回一条一条的记录(可以是行,或者是你逻辑中的“行”)
  • 然后利用 SerDe<默认:org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe>的一个具体 实现类,对上面返回的一条一条的记录进行字段切割

Hive 对文件中字段的分隔符默认情况下只支持单字节分隔符,对于多字符,可以参考使用RegexSerDe正则表达式解析

create table t_bi_reg(id string,name string)
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'with serdeproperties('input.regex'='(.*)\\|\\|(.*)','output.format.string'='%1$s %2$s');

01||huangbo
02||xuzheng
03||wangbaoqiang