Hive实战—影评案例 - timebusker的博客

数据准备

原始数据是以::进行切分的，所以需要使用能解析多字节分隔符的Serde即可，使用RegexSerde需要两个参数：

`input.regex = "(.*)::(.*)::(.*)"`

`output.format.string = "%1$s %2$s %3$s"`

用户表

# 数据格式-2::M::56::16::70072
# 用户id，性别，年龄，职业，邮政编码

drop table if exists t_user;
create table t_user(
   userid bigint,
   sex string,
   age int,
   occupation string,
   zipcode string
) row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe' 
with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s %5$s');

load data local inpath '/root/data/users.dat' into table t_user;
select * from t_user limit 10;

电影表

# 字段为：MovieID BigInt, Title String, Genres String
# 字段中文解释：电影ID，电影名字，电影类型
create table t_movie(
   movieid bigint,
   moviename string,
   movietype string
) row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe' 
with serdeproperties('input.regex'='(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s');

load data local inpath '/root/data/movies.dat' into table t_movie;
select * from t_movie limit 10;

评分表

# 字段为：UserID BigInt, MovieID BigInt, Rating Double, Timestamped String
# 字段中文解释：用户ID，电影ID，评分，评分时间戳
create table t_rating(
   userid bigint,
   movieid bigint,
   rate double,
   times string
) row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe' 
with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s');

load data local inpath '/root/data/ratings.dat' into table t_rating;
select * from t_rating limit 10;

求被评分次数最多的10部电影，并给出评分次数（电影名，评分次数）

# 评分次数最多的电影
drop table if exists t_rating_tmp;
create table t_rating_tmp as 
select a.moviename as moviename,count(a.moviename) as total from t_movie a join t_rating b on a.movieid=b.movieid 
group by a.moviename order by total desc limit 10;

分别求男性，女性当中评分最高的10部电影且`评论次数大于等于50次`

drop table if exists t_rating_tmp;
create table t_rating_tmp as 
select sex,moviename, avg(rate) as avg_rate, count(moviename) as total  
from t_rating a join t_user b on a.userid=b.userid join t_movie c on a.movieid=c.movieid 
where b.sex="F" group by b.sex,c.moviename having total >= 50 order by avg_rate desc limit 10;

insert into t_rating_tmp 
select sex,moviename, avg(rate) as avg_rate, count(moviename) as total  
from t_rating a join t_user b on a.userid=b.userid join t_movie c on a.movieid=c.movieid 
where b.sex="M" group by b.sex,c.moviename having total >= 50 order by avg_rate desc limit 10;

各年龄段的平均影评

select ages,avg(rate) rate from (
select (case when age<=18 then '0-18' when (age>18 and age<=30) then '19-30' when (age>30 and age <=40) then '31-40' when (age>40 and age<=50) then '41-50' else '>=51' end) as ages,rate 
from t_rating r join t_user u on r.userid=u.userid
) t group by ages;

最喜欢看电影（影评次数最多）的那位女性评最高分的10部电影的平均影评分（观影者，电影名，影评分）

select a.userid, count(a.userid) as total 
from t_rating a join t_user b on a.userid = b.userid 
where b.sex="F" group by a.userid order by total desc limit 1;

create table t_rating_tmp1 as 
select a.movieid as movieid, a.rate as rate  
from t_rating a where a.userid=1150 order by rate desc limit 10;

create table t_rating_tmp2 as 
select b.movieid as movieid, c.moviename as moviename, avg(b.rate) as avgrate 
from t_rating_tmp1 a join t_rating b on a.movieid=b.movieid join t_movie c on b.movieid=c.movieid 
group by b.movieid,c.moviename;

数据准备

求被评分次数最多的10部电影，并给出评分次数（电影名，评分次数）

分别求男性，女性当中评分最高的10部电影且评论次数大于等于50次

各年龄段的平均影评

最喜欢看电影（影评次数最多）的那位女性评最高分的10部电影的平均影评分（观影者，电影名，影评分）

目录

分别求男性，女性当中评分最高的10部电影且`评论次数大于等于50次`