数据准备
cookie1,2015-04-10 10:00:02,url2
cookie1,2015-04-10 10:00:00,url1
cookie1,2015-04-10 10:03:04,1url3
cookie1,2015-04-10 10:50:05,url6
cookie1,2015-04-10 11:00:00,url7
cookie1,2015-04-10 10:10:00,url4
cookie1,2015-04-10 10:50:01,url5
cookie2,2015-04-10 10:00:02,url22
cookie2,2015-04-10 10:00:00,url11
cookie2,2015-04-10 10:03:04,1url33
cookie2,2015-04-10 10:50:05,url66
cookie2,2015-04-10 11:00:00,url77
cookie2,2015-04-10 10:10:00,url44
cookie2,2015-04-10 10:50:01,url55
create table cookie2(cookieid string, createtime string, url string) row format delimited fields terminated by ',';
load data local inpath "/root/data/cookie2" into table cookie2;
select * from cookie2;
lag
对数据局部分组排序,
往上
取第N条数据的指定列作为返回值,如果返回值为空,可使用默认值回填
LAG(col,n,DEFAULT) 用于统计窗口内往上第n行值 第一个参数为列名, 第二个参数为往上第n行(可选,默认为1), 第三个参数为默认值(当往上第n行为NULL时候,取默认值,如不指定,则为NULL)
select
cookieid,
createtime,
url,
row_number() over (partition by cookieid order by createtime) as rn,
LAG(createtime,1) over (partition by cookieid order by createtime) as last_time11,
LAG(createtime,1,'1970-01-01 00:00:00') over (partition by cookieid order by createtime) as last_time12,
LAG(createtime,2) over (partition by cookieid order by createtime) as last_time21,
LAG(createtime,2,'1970-01-01 00:00:00') over (partition by cookieid order by createtime) as last_time22
from cookie2;
+-----------+----------------------+---------+-----+----------------------+----------------------+----------------------+----------------------+--+
| cookieid | createtime | url | rn | last_time11 | last_time12 | last_time21 | last_time22 |
+-----------+----------------------+---------+-----+----------------------+----------------------+----------------------+----------------------+--+
| cookie1 | 2015-04-10 10:00:00 | url1 | 1 | NULL | 1970-01-01 00:00:00 | NULL | 1970-01-01 00:00:00 |
| cookie1 | 2015-04-10 10:00:02 | url2 | 2 | 2015-04-10 10:00:00 | 2015-04-10 10:00:00 | NULL | 1970-01-01 00:00:00 |
| cookie1 | 2015-04-10 10:03:04 | 1url3 | 3 | 2015-04-10 10:00:02 | 2015-04-10 10:00:02 | 2015-04-10 10:00:00 | 2015-04-10 10:00:00 |
| cookie1 | 2015-04-10 10:10:00 | url4 | 4 | 2015-04-10 10:03:04 | 2015-04-10 10:03:04 | 2015-04-10 10:00:02 | 2015-04-10 10:00:02 |
| cookie1 | 2015-04-10 10:50:01 | url5 | 5 | 2015-04-10 10:10:00 | 2015-04-10 10:10:00 | 2015-04-10 10:03:04 | 2015-04-10 10:03:04 |
| cookie1 | 2015-04-10 10:50:05 | url6 | 6 | 2015-04-10 10:50:01 | 2015-04-10 10:50:01 | 2015-04-10 10:10:00 | 2015-04-10 10:10:00 |
| cookie1 | 2015-04-10 11:00:00 | url7 | 7 | 2015-04-10 10:50:05 | 2015-04-10 10:50:05 | 2015-04-10 10:50:01 | 2015-04-10 10:50:01 |
| cookie2 | 2015-04-10 10:00:00 | url11 | 1 | NULL | 1970-01-01 00:00:00 | NULL | 1970-01-01 00:00:00 |
| cookie2 | 2015-04-10 10:00:02 | url22 | 2 | 2015-04-10 10:00:00 | 2015-04-10 10:00:00 | NULL | 1970-01-01 00:00:00 |
| cookie2 | 2015-04-10 10:03:04 | 1url33 | 3 | 2015-04-10 10:00:02 | 2015-04-10 10:00:02 | 2015-04-10 10:00:00 | 2015-04-10 10:00:00 |
| cookie2 | 2015-04-10 10:10:00 | url44 | 4 | 2015-04-10 10:03:04 | 2015-04-10 10:03:04 | 2015-04-10 10:00:02 | 2015-04-10 10:00:02 |
| cookie2 | 2015-04-10 10:50:01 | url55 | 5 | 2015-04-10 10:10:00 | 2015-04-10 10:10:00 | 2015-04-10 10:03:04 | 2015-04-10 10:03:04 |
| cookie2 | 2015-04-10 10:50:05 | url66 | 6 | 2015-04-10 10:50:01 | 2015-04-10 10:50:01 | 2015-04-10 10:10:00 | 2015-04-10 10:10:00 |
| cookie2 | 2015-04-10 11:00:00 | url77 | 7 | 2015-04-10 10:50:05 | 2015-04-10 10:50:05 | 2015-04-10 10:50:01 | 2015-04-10 10:50:01 |
+-----------+----------------------+---------+-----+----------------------+----------------------+----------------------+----------------------+--+
lead
对数据局部分组排序,
往下
取第N条数据的指定列作为返回值,如果返回值为空,可使用默认值回填
LEAD(col,n,DEFAULT) 用于统计窗口内往下第n行值 第一个参数为列名, 第二个参数为往下第n行(可选,默认为1), 第三个参数为默认值(当往下第n行为NULL时候,取默认值,如不指定,则为NULL)
select
cookieid,
createtime,
url,
row_number() over (partition by cookieid order by createtime) as rn,
LEAD(createtime,1) over (partition by cookieid order by createtime) as last_time11,
LEAD(createtime,1,'1970-01-01 00:00:00') over (partition by cookieid order by createtime) as last_time12,
LEAD(createtime,2) over (partition by cookieid order by createtime) as last_time21,
LEAD(createtime,2,'1970-01-01 00:00:00') over (partition by cookieid order by createtime) as last_time22
from cookie2;
+-----------+----------------------+---------+-----+----------------------+----------------------+----------------------+----------------------+--+
| cookieid | createtime | url | rn | last_time11 | last_time12 | last_time21 | last_time22 |
+-----------+----------------------+---------+-----+----------------------+----------------------+----------------------+----------------------+--+
| cookie1 | 2015-04-10 10:00:00 | url1 | 1 | 2015-04-10 10:00:02 | 2015-04-10 10:00:02 | 2015-04-10 10:03:04 | 2015-04-10 10:03:04 |
| cookie1 | 2015-04-10 10:00:02 | url2 | 2 | 2015-04-10 10:03:04 | 2015-04-10 10:03:04 | 2015-04-10 10:10:00 | 2015-04-10 10:10:00 |
| cookie1 | 2015-04-10 10:03:04 | 1url3 | 3 | 2015-04-10 10:10:00 | 2015-04-10 10:10:00 | 2015-04-10 10:50:01 | 2015-04-10 10:50:01 |
| cookie1 | 2015-04-10 10:10:00 | url4 | 4 | 2015-04-10 10:50:01 | 2015-04-10 10:50:01 | 2015-04-10 10:50:05 | 2015-04-10 10:50:05 |
| cookie1 | 2015-04-10 10:50:01 | url5 | 5 | 2015-04-10 10:50:05 | 2015-04-10 10:50:05 | 2015-04-10 11:00:00 | 2015-04-10 11:00:00 |
| cookie1 | 2015-04-10 10:50:05 | url6 | 6 | 2015-04-10 11:00:00 | 2015-04-10 11:00:00 | NULL | 1970-01-01 00:00:00 |
| cookie1 | 2015-04-10 11:00:00 | url7 | 7 | NULL | 1970-01-01 00:00:00 | NULL | 1970-01-01 00:00:00 |
| cookie2 | 2015-04-10 10:00:00 | url11 | 1 | 2015-04-10 10:00:02 | 2015-04-10 10:00:02 | 2015-04-10 10:03:04 | 2015-04-10 10:03:04 |
| cookie2 | 2015-04-10 10:00:02 | url22 | 2 | 2015-04-10 10:03:04 | 2015-04-10 10:03:04 | 2015-04-10 10:10:00 | 2015-04-10 10:10:00 |
| cookie2 | 2015-04-10 10:03:04 | 1url33 | 3 | 2015-04-10 10:10:00 | 2015-04-10 10:10:00 | 2015-04-10 10:50:01 | 2015-04-10 10:50:01 |
| cookie2 | 2015-04-10 10:10:00 | url44 | 4 | 2015-04-10 10:50:01 | 2015-04-10 10:50:01 | 2015-04-10 10:50:05 | 2015-04-10 10:50:05 |
| cookie2 | 2015-04-10 10:50:01 | url55 | 5 | 2015-04-10 10:50:05 | 2015-04-10 10:50:05 | 2015-04-10 11:00:00 | 2015-04-10 11:00:00 |
| cookie2 | 2015-04-10 10:50:05 | url66 | 6 | 2015-04-10 11:00:00 | 2015-04-10 11:00:00 | NULL | 1970-01-01 00:00:00 |
| cookie2 | 2015-04-10 11:00:00 | url77 | 7 | NULL | 1970-01-01 00:00:00 | NULL | 1970-01-01 00:00:00 |
+-----------+----------------------+---------+-----+----------------------+----------------------+----------------------+----------------------+--+
last_value/first_value
last_value
:取分组内排序后,截止到当前行,最后一个值
first_value
:取分组内排序后,截止到当前行,第一个值
select
cookieid,
createtime,
url,
row_number() over(partition by cookieid order by createtime) as rn,
first_value(url) over(partition by cookieid order by createtime) as first_value,
last_value(url) over(partition by cookieid order by createtime) as last_value
from cookie2;
+-----------+----------------------+---------+-----+--------------+-------------+--+
| cookieid | createtime | url | rn | first_value | last_value |
+-----------+----------------------+---------+-----+--------------+-------------+--+
| cookie1 | 2015-04-10 10:00:00 | url1 | 1 | url1 | url1 |
| cookie1 | 2015-04-10 10:00:02 | url2 | 2 | url1 | url2 |
| cookie1 | 2015-04-10 10:03:04 | 1url3 | 3 | url1 | 1url3 |
| cookie1 | 2015-04-10 10:10:00 | url4 | 4 | url1 | url4 |
| cookie1 | 2015-04-10 10:50:01 | url5 | 5 | url1 | url5 |
| cookie1 | 2015-04-10 10:50:05 | url6 | 6 | url1 | url6 |
| cookie1 | 2015-04-10 11:00:00 | url7 | 7 | url1 | url7 |
| cookie2 | 2015-04-10 10:00:00 | url11 | 1 | url11 | url11 |
| cookie2 | 2015-04-10 10:00:02 | url22 | 2 | url11 | url22 |
| cookie2 | 2015-04-10 10:03:04 | 1url33 | 3 | url11 | 1url33 |
| cookie2 | 2015-04-10 10:10:00 | url44 | 4 | url11 | url44 |
| cookie2 | 2015-04-10 10:50:01 | url55 | 5 | url11 | url55 |
| cookie2 | 2015-04-10 10:50:05 | url66 | 6 | url11 | url66 |
| cookie2 | 2015-04-10 11:00:00 | url77 | 7 | url11 | url77 |
+-----------+----------------------+---------+-----+--------------+-------------+--+