Hive实战—影评案例

By timebusker on February 23, 2018

数据准备

原始数据是以::进行切分的,所以需要使用能解析多字节分隔符的Serde即可,使用RegexSerde需要两个参数:

`input.regex = "(.*)::(.*)::(.*)"`

`output.format.string = "%1$s %2$s %3$s"`
  • 用户表
# 数据格式-2::M::56::16::70072
# 用户id,性别,年龄,职业,邮政编码

drop table if exists t_user;
create table t_user(
   userid bigint,
   sex string,
   age int,
   occupation string,
   zipcode string
) row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe' 
with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s %5$s');

load data local inpath '/root/data/users.dat' into table t_user;
select * from t_user limit 10;
  • 电影表
# 字段为:MovieID BigInt, Title String, Genres String
# 字段中文解释:电影ID,电影名字,电影类型
create table t_movie(
   movieid bigint,
   moviename string,
   movietype string
) row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe' 
with serdeproperties('input.regex'='(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s');

load data local inpath '/root/data/movies.dat' into table t_movie;
select * from t_movie limit 10;
  • 评分表
# 字段为:UserID BigInt, MovieID BigInt, Rating Double, Timestamped String
# 字段中文解释:用户ID,电影ID,评分,评分时间戳
create table t_rating(
   userid bigint,
   movieid bigint,
   rate double,
   times string
) row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe' 
with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s');

load data local inpath '/root/data/ratings.dat' into table t_rating;
select * from t_rating limit 10;
求被评分次数最多的10部电影,并给出评分次数(电影名,评分次数)
# 评分次数最多的电影
drop table if exists t_rating_tmp;
create table t_rating_tmp as 
select a.moviename as moviename,count(a.moviename) as total from t_movie a join t_rating b on a.movieid=b.movieid 
group by a.moviename order by total desc limit 10;
分别求男性,女性当中评分最高的10部电影且评论次数大于等于50次
drop table if exists t_rating_tmp;
create table t_rating_tmp as 
select sex,moviename, avg(rate) as avg_rate, count(moviename) as total  
from t_rating a join t_user b on a.userid=b.userid join t_movie c on a.movieid=c.movieid 
where b.sex="F" group by b.sex,c.moviename having total >= 50 order by avg_rate desc limit 10;

insert into t_rating_tmp 
select sex,moviename, avg(rate) as avg_rate, count(moviename) as total  
from t_rating a join t_user b on a.userid=b.userid join t_movie c on a.movieid=c.movieid 
where b.sex="M" group by b.sex,c.moviename having total >= 50 order by avg_rate desc limit 10;
各年龄段的平均影评
select ages,avg(rate) rate from (
select (case when age<=18 then '0-18' when (age>18 and age<=30) then '19-30' when (age>30 and age <=40) then '31-40' when (age>40 and age<=50) then '41-50' else '>=51' end) as ages,rate 
from t_rating r join t_user u on r.userid=u.userid
) t group by ages;
最喜欢看电影(影评次数最多)的那位女性评最高分的10部电影的平均影评分(观影者,电影名,影评分)
select a.userid, count(a.userid) as total 
from t_rating a join t_user b on a.userid = b.userid 
where b.sex="F" group by a.userid order by total desc limit 1;

create table t_rating_tmp1 as 
select a.movieid as movieid, a.rate as rate  
from t_rating a where a.userid=1150 order by rate desc limit 10;

create table t_rating_tmp2 as 
select b.movieid as movieid, c.moviename as moviename, avg(b.rate) as avgrate 
from t_rating_tmp1 a join t_rating b on a.movieid=b.movieid join t_movie c on b.movieid=c.movieid 
group by b.movieid,c.moviename;