SELECT a, b, SUM(c) FROM tab1 GROUP BY a, b GROUPING SETS ( (a,b) )
SELECT a, b, SUM(c) FROM tab1 GROUP BY a, b
SELECT a, b, SUM( c ) FROM tab1 GROUP BY a, b GROUPING SETS ( (a,b), a)
SELECT a, b, SUM( c ) FROM tab1 GROUP BY a, b UNION SELECT a, null, SUM( c ) FROM tab1 GROUP BY a
SELECT a,b, SUM( c ) FROM tab1 GROUP BY a, b GROUPING SETS (a,b)
SELECT a, null, SUM( c ) FROM tab1 GROUP BY a union SELECT null, b, SUM( c ) FROM tab1 GROUP BY b
SELECT a, b, SUM( c ) FROM tab1 GROUP BY a, b GROUPING SETS ( (a, b), a, b, ( ) )
SELECT a, b, SUM( c ) FROM tab1 GROUP BY a, b union SELECT a, null, SUM( c ) FROM tab1 GROUP BY a, null union SELECT null, b, SUM( c ) FROM tab1 GROUP BY null, b union SELECT null, null, SUM( c ) FROM tab1
Cubes and Rollups
WITH CUBE/ROLLUP 只能用于Group by环境下, Cube 用group by的列集创建了一个具有所有可能组合的子集合。一旦计算了维度集合的cube, 我们就能够获取这些维度的所有可能的聚合操作
cube
group by
GROUP BY a, b, c WITH CUBE
GROUP BY a, b, c GROUPING SETS ((a,b,c), (a, b), (b, c), (a, c), (a), (b), (c), ())
rollup
group by
GROUP BY a, b, c with ROLLUP
GROUP BY a, b, c GROUPING SETS ((a,b,c), (a, b), (a), ())
hive.new.job.grouping.set.cardinality
grouping sets/rollup/cubes 都是导致一个mr的任务被加载,类如: select a, b, c, count(1) from T group by a, b, c with rollup; , 每行数据都是生成四行 (a,b,c), (a,b,null), (a, null, null), (null,null,null), 当table T 容量很大的时候,在mr的过程中就会造成数据膨胀, map端的聚合将不能完成。
INSERT OVERWRITE TABLE pv_gender_agg
SELECT pv_users.gender, count(DISTINCT pv_users.userid), count(DISTINCT pv_users.ip)
FROM pv_users
GROUP BY pv_users.gender;
CREATE TABLE t1(a INTEGER, b INTGER);
SELECT
a,
sum(b)
FROM
t1
GROUP BY
a;
SELECT
a,
b
FROM
t1
GROUP BY
a;
FROM pv_users
INSERT OVERWRITE TABLE pv_gender_sum
SELECT pv_users.gender, count(DISTINCT pv_users.userid)
GROUP BY pv_users.gender
INSERT OVERWRITE DIRECTORY '/user/facebook/tmp/pv_age_sum'
SELECT pv_users.age, count(DISTINCT pv_users.userid)
GROUP BY pv_users.age;
set hive.map.aggr=true;
SELECT COUNT(*) FROM table2;