大数据常用SQL - xd21303/Notebook GitHub Wiki

大数据常用SQL

TIMESTAMP转换

# 13位时间戳转化为字符串
from_unixtime(cast(timestamps/1000 as int), 'yyyy-MM-dd HH:mm:ss')

to_unix_timestamp('2021-11-30 11:12:55')
# 字符串转化为时间戳
unix_timestamp('2021-11-30 11:12:55') # impala 只支持unix_timestamp

查看表详情

desc extended tablename

日期加减

date_add(start_date, num_days)

length

hive 里中文是1个长度，impala中文是3个长度

中位数

# hive 精确中位数
percentile(cast(trade_amount as BIGINT), 0.5)

# hive 粗略中位数
percentile_approx(trade_amount, 0.5)

# impala
APPX_MEDIAN(trade_amount)

hive 保留小数点后2位

cast(number as decimal(10, 2))

分区表增加字段

cascade 字段会修改所有分区，包括历史。

如果不刷历史，就不要用cascade，使用默认的RESTRICT。

alter table ads.ads_xf_cargo_looking_for_car_model_predict add columns (use_truck_len string comment '司机车长') cascade;
alter table ads.ads_xf_cargo_looking_for_car_model_predict change column use_truck_len use_truck_len string after d2c_distance cascade;

判断星期几

0~6对应星期天到星期六

pmod(datediff('2022-05-08', '1920-01-01') - 3, 7)

# 判断是0，将其变成7
select
  IF(
    pmod(datediff('2022-05-05', '1920-01-01') - 3, 7) = '0',
    7,
    pmod(datediff('2022-05-05', '1920-01-01') - 3, 7)
  )

分割字符串

在hive中，一个中文字符长度为1

在impala中，一个中文字符长度为3

# hive 序号0,1,2
split(sub_market_id, '_')

# impala 序号1,2,3
SPLIT_PART(sub_market_id, '_', 2)

json列表变行

lateral view hive_udf.ymm_explode_json_array(driver_list) t AS user_ids

多行变一行

concat_ws('&', collect_set(config_value))

group by
  config_key

多个列变多行（列转行）

select
  no,
  num
from
  (
    select
      'no' as no,
      '111' as one,
      '222' as two,
      null as three
  ) a lateral view explode(split(concat(one, ',', two), ',')) b as num

正则

REGEXP_EXTRACT((a.`工号`), '[0-9]+', 0)

https://help.aliyun.com/document_detail/27871.htm?spm=a2c4g.11186623.0.0.5279753eTS2Dsr#concept-ihp-ntf-vdb

最大值、最小值

max，min 是针对列

greatest,least 是针对行

生成1到任意数据

WITH Digits AS (
    SELECT 0 AS Number
    UNION SELECT 1
    UNION SELECT 2
    UNION SELECT 3
    UNION SELECT 4
    UNION SELECT 5
    UNION SELECT 6
    UNION SELECT 7
    UNION SELECT 8
    UNION SELECT 9
)

SELECT chapter FROM (
    SELECT
          (d4.Number * 10000)
        + (d3.Number * 1000)
        + (d2.Number * 100)
        + (d1.Number * 10)
        + d0.Number AS chapter
    FROM
          Digits AS d4
        , Digits AS d3
        , Digits AS d2
        , Digits AS d1
        , Digits AS d0
    ) t1
WHERE chapter > 0 AND chapter <= 5000
ORDER BY chapter