数据准备
原始数据是以::进行切分的,所以需要使用能解析多字节分隔符的Serde即可,使用RegexSerde需要两个参数:
`input.regex = "(.*)::(.*)::(.*)"`
`output.format.string = "%1$s %2$s %3$s"`
- 用户表
# 数据格式-2::M::56::16::70072
# 用户id,性别,年龄,职业,邮政编码
drop table if exists t_user;
create table t_user(
userid bigint,
sex string,
age int,
occupation string,
zipcode string
) row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s %5$s');
load data local inpath '/root/data/users.dat' into table t_user;
select * from t_user limit 10;
- 电影表
# 字段为:MovieID BigInt, Title String, Genres String
# 字段中文解释:电影ID,电影名字,电影类型
create table t_movie(
movieid bigint,
moviename string,
movietype string
) row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s');
load data local inpath '/root/data/movies.dat' into table t_movie;
select * from t_movie limit 10;
- 评分表
# 字段为:UserID BigInt, MovieID BigInt, Rating Double, Timestamped String
# 字段中文解释:用户ID,电影ID,评分,评分时间戳
create table t_rating(
userid bigint,
movieid bigint,
rate double,
times string
) row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s');
load data local inpath '/root/data/ratings.dat' into table t_rating;
select * from t_rating limit 10;
求被评分次数最多的10部电影,并给出评分次数(电影名,评分次数)
# 评分次数最多的电影
drop table if exists t_rating_tmp;
create table t_rating_tmp as
select a.moviename as moviename,count(a.moviename) as total from t_movie a join t_rating b on a.movieid=b.movieid
group by a.moviename order by total desc limit 10;
分别求男性,女性当中评分最高的10部电影且评论次数大于等于50次
drop table if exists t_rating_tmp;
create table t_rating_tmp as
select sex,moviename, avg(rate) as avg_rate, count(moviename) as total
from t_rating a join t_user b on a.userid=b.userid join t_movie c on a.movieid=c.movieid
where b.sex="F" group by b.sex,c.moviename having total >= 50 order by avg_rate desc limit 10;
insert into t_rating_tmp
select sex,moviename, avg(rate) as avg_rate, count(moviename) as total
from t_rating a join t_user b on a.userid=b.userid join t_movie c on a.movieid=c.movieid
where b.sex="M" group by b.sex,c.moviename having total >= 50 order by avg_rate desc limit 10;
各年龄段的平均影评
select ages,avg(rate) rate from (
select (case when age<=18 then '0-18' when (age>18 and age<=30) then '19-30' when (age>30 and age <=40) then '31-40' when (age>40 and age<=50) then '41-50' else '>=51' end) as ages,rate
from t_rating r join t_user u on r.userid=u.userid
) t group by ages;
最喜欢看电影(影评次数最多)的那位女性评最高分的10部电影的平均影评分(观影者,电影名,影评分)
select a.userid, count(a.userid) as total
from t_rating a join t_user b on a.userid = b.userid
where b.sex="F" group by a.userid order by total desc limit 1;
create table t_rating_tmp1 as
select a.movieid as movieid, a.rate as rate
from t_rating a where a.userid=1150 order by rate desc limit 10;
create table t_rating_tmp2 as
select b.movieid as movieid, c.moviename as moviename, avg(b.rate) as avgrate
from t_rating_tmp1 a join t_rating b on a.movieid=b.movieid join t_movie c on b.movieid=c.movieid
group by b.movieid,c.moviename;