SUM(Salary) OVER (PARTITION BY Groupname ORDER BY ID)
对PARTITION BY后面的列Groupname进行分组,然后按ORDER BY 后的ID进行排序,然后在组内对Salary进行累加处理。
SUM(Salary) OVER (ORDER BY ID)
只对ORDER BY 后的ID内容进行排序,对排完序后的Salary进行累加处理。
SUM(Salary) OVER ()
对Salary进行汇总处理
这里的主标题就是开窗函数。
比较容易出错而且难以理解的是:COUNT(DISTINCT wuid) AS unique_wuid_count, SUM(COUNT(DISTINCT wuid)) OVER() AS total_unique_wuid_count from这样的设计。左列的累积和不一定等于右边的值,这样的bug比较难调试处理。可能的原因是跨分区可能存在重复值,SUM(COUNT(DISTINCT wuid)) OVER()会导致更多的去重,导致 total_unique_wuid_count 不等于的累加。
选出通话时间最早的一条记录:
1 2 3 4 5 6 7
select a,l,created_time,row_number() over(partitionby a,l orderby created_time asc) as rn from tmp where impl_date=%(i_datefrom)s and to_char(created_time, 'yyyyMMdd') between %(i_date_pre_m1)s and%(i_datefrom)s) tmp_cri where rn=1
case when
嵌套 case when
在select 的一个目标列里:
1 2
-- 在减法以前有一个 case when。在减法外有一个 case when,when 和 then 之间是一个减法 casewhen ((casewhen cri.call_time isnullthen9999999999else unix_timestamp(cri.call_time) end)-unix_timestamp(a.created_time)) <=7*24*60*60then1else0endas is_call_in_time
WITH used_names AS ( SELECT'C'AS cp_name UNIONALLSELECT'D' ), all_names AS ( SELECT'A'AS name UNIONALLSELECT name FROM used_names) -- 每多一行,加入一个UNION ALL,各行可以不加 AS name INSERT OVERWRITE TABLE db.tbl_1 PARTITION (ds ='${last_day}') SELECT a, b
FROM db.tbl_2 a INNERJOIN all_names n ON a.cp_name = n.cp_name LEFTJOIN used_names u ON a.cp_name = u.cp_name
first_call_dates AS ( SELECT phone_no, MIN(call_date) AS first_call_date FROM all_history_calls GROUPBY phone_no ))
-- 最终聚合结果 SELECT t.call_date AS stat_date, COUNT(t.phone_no) AS total_calls_today, COUNT(CASEWHEN f.first_call_date = t.call_date -- 如果统计全行业,则去掉这个 AND AND GET_JSON_OBJECT(t.content, '$.info.bizType') IN ('life','dining','life_cpc_reach_store','life_cpc_popularized','home_industry_customer_service','life_cpc_latent') THEN1END) AS first_time_called_today FROM today_calls t LEFTJOIN first_call_dates f ON t.phone_no = f.phone_no GROUPBY t.call_date;
对于同一个结果表而言,我们在 select 里再进行 case when,可以保证 where 里取出全量数据,然后在其他查询衍生子句里产生其他过滤列和过滤列的聚合结果。
WHERE、CASE WHEN和HAVING的核心区别
WHERE
作用:过滤原始数据行
执行时机:第一步执行,在数据检索时
核心特点:
用于行级过滤,决定哪些行参与后续处理
不能使用聚合函数
性能最高,因为它减少了需要处理的数据量
过滤发生在分组之前
示例:
1 2
WHERE GET_JSON_OBJECT(content, '$.log_type') ='call_end' AND SUBSTR(GET_JSON_OBJECT(content, '$.data.start_time'), 1, 10) ='2025-12-07'
SELECT phone_no, call_date, call_status, -- 根据通话状态创建中文描述 CASE WHEN call_status ='ANSWERED'THEN'已接听' WHEN call_status ='NO_ANSWER'THEN'未接听' WHEN call_status ='BUSY'THEN'占线' ELSE'其他状态' ENDAS status_description, -- 根据通话时长分类 CASE WHEN duration <30THEN'短通话' WHEN duration BETWEEN30AND120THEN'中等通话' WHEN duration >120THEN'长通话' ELSE'未知' ENDAS duration_category FROM calls;
SELECT call_date, COUNT(*) AS total_calls, -- 统计接听的通话数 COUNT(CASEWHEN call_status ='ANSWERED'THEN1END) AS answered_calls, -- 统计未接听的通话数 COUNT(CASEWHEN call_status IN ('NO_ANSWER', 'BUSY') THEN1END) AS unanswered_calls, -- 计算接听率 ROUND( COUNT(CASEWHEN call_status ='ANSWERED'THEN1END) *100.0/COUNT(*), 2 ) AS answer_rate, -- 计算平均有效通话时长(只计算接听的) AVG(CASEWHEN call_status ='ANSWERED'THEN duration END) AS avg_effective_duration FROM calls GROUPBY call_date;
SELECT* FROM calls WHERE -- 根据不同业务类型设置不同的通话时长要求 CASE WHEN biz_type ='life'THEN duration >=30 WHEN biz_type ='dining'THEN duration >=60 WHEN biz_type ='home_industry'THEN duration >=45 ELSE duration >=15 END=TRUE -- 并且满足其他条件 AND call_status ='ANSWERED';
SELECT* FROM calls WHERE -- 工作日和周末使用不同的时间范围 CASE WHEN DAYOFWEEK(call_date) IN (1, 7) THEN-- 周末 start_time BETWEEN'09:00:00'AND'20:00:00' ELSE-- 工作日 start_time BETWEEN'08:00:00'AND'18:00:00' END=TRUE;
SELECT -- 根据通话时长动态分组 CASE WHEN duration <30THEN'0-30秒' WHEN duration BETWEEN30AND60THEN'30-60秒' WHEN duration BETWEEN61AND120THEN'1-2分钟' WHEN duration >120THEN'2分钟以上' ELSE'未知' ENDAS duration_group, COUNT(*) AS call_count, AVG(duration) AS avg_duration FROM calls GROUPBY CASE WHEN duration <30THEN'0-30秒' WHEN duration BETWEEN30AND60THEN'30-60秒' WHEN duration BETWEEN61AND120THEN'1-2分钟' WHEN duration >120THEN'2分钟以上' ELSE'未知' END ORDERBY avg_duration;
SELECT phone_no, biz_type, call_status, duration FROM calls ORDERBY -- 按业务类型重要性排序,1是升序排序的第一位 CASE biz_type WHEN'life'THEN1 WHEN'dining'THEN2 WHEN'home_industry'THEN3 ELSE4 END, -- 然后按通话状态排序 CASE call_status WHEN'ANSWERED'THEN1 WHEN'NO_ANSWER'THEN2 WHEN'BUSY'THEN3 ELSE4 END, -- 最后按时长降序排列 duration DESC;
SELECT phone_no FROM calls GROUPBY phone_no HAVING -- 只保留接听率超过80%的号码 COUNT(CASEWHEN call_status ='ANSWERED'THEN1END) *100.0/COUNT(*) >80 -- 并且总通话次数大于10次 ANDCOUNT(*) >10;
case when 本质上还是一种 expression,WHEN biz_type = 'life' THEN duration >= 30返回的还是布尔表达式。
-- 这些都是有效的SELECT语句 SELECT1FROM table_name; -- 每行都返回1 SELECT'constant_value'FROM table_name; -- 每行都返回相同字符串 SELECT1AS row_indicator FROM table_name; -- 每行都返回1,并起别名 SELECT phone_no, 1AS flag FROM calls; -- 返回实际列+常量列
-- 假设有一张表,其中some_column有NULL值: -- | id | some_column | -- |----|-------------| -- | 1 | A | -- | 2 | NULL | -- | 3 | B |
SELECT COUNT(*) AS total_rows, -- 结果: 3 COUNT(1) AS also_total_rows, -- 结果: 3 COUNT(some_column) AS non_null_values -- 结果: 2 FROM table_name;
-- 统计行数的不同方式 SELECT COUNT(*) AS row_count_method1, SUM(1) AS row_count_method2, -- 每行加1,等效于COUNT(*) SUM(2) AS double_row_count -- 每行加2,等于行数×2 FROM calls;