炒冷饭,一道16年前的北美OUG‐SQL挑战赛回顾‐2 - l1t1/note GitHub Wiki
无意中发现,newkid在《剑破冰山》一书中期待的在递归CTE中使用聚合函数的写法在duckdb v1.1.3中已经实现了,
WITH recursive t(v,p,lvl) AS (
SELECT face_value v
,probability p
,1
FROM die
UNION ALL
SELECT d.face_value + t.v ---- 第N次投掷,面值为前N-1次的各种结果加上die表的六种可能面值
,SUM(d.probability * t.p) ---- 把相同的总面值的概率加起来
,MAX(t.lvl)+1
FROM die d,t
WHERE t.lvl<2000
GROUP BY d.face_value + t.v ---- 按照总面值分组聚合,再做下一次递归
)
SELECT *
FROM t
WHERE lvl = 2000
order by 1;
│ 15998 │ 0.0 │ 2000 │
│ 16000 │ 0.0 │ 2000 │
├───────┴────────┴───────┤
│ 13999 rows (40 shown) │
└────────────────────────┘
Run Time (s): real 2.997 user 6.000000 sys 2.125000
用快速幂法写了个双重递归,N=100时和newkid的单次递归也差不多快,N=2000时还比单次递归用时多一倍多。
with recursive
t(v,p,lvl) AS (
SELECT face_value v
,probability p
,1
FROM die
UNION ALL
SELECT d.v + t.v ---- 第N次投掷,面值为前N-1次的各种结果加上die表的六种可能面值
,SUM(d.p * t.p) ---- 把相同的总面值的概率加起来
,MAX(t.lvl)*2
FROM t d,t
WHERE t.lvl<2000
GROUP BY d.v + t.v ---- 按照总面值分组聚合,再做下一次递归
),t1 as(select *,dense_rank()over(order by lvl)rk from t where lvl & 2000<>0)
, t2(v, p, rk) as(select 0 v, 1::double p, 0 rk
union all
select t1.v + t2.v, sum(t1.p * t2.p),max(t1.rk)from t1,t2 where t1.rk=t2.rk+1 GROUP BY t1.v + t2.v)
select v,p from t2 where rk=(select max(rk) from t2)
order by 1;
│ 16000 │ 0.0 │
├───────┴────────┤
│ 13999 rows │
│ (40 shown) │
└────────────────┘
Run Time (s): real 6.965 user 18.281250 sys 0.109375
用了materialized
关键字才比单次递归快了。
with recursive
t(v,p,lvl)AS materialized (
SELECT face_value v
,probability p
,1
FROM die
UNION ALL
SELECT d.v + t.v ---- 第N次投掷,面值为前N-1次的各种结果加上die表的六种可能面值
,SUM(d.p * t.p) ---- 把相同的总面值的概率加起来
,MAX(t.lvl)*2
FROM t d,t
WHERE t.lvl<2000
GROUP BY d.v + t.v ---- 按照总面值分组聚合,再做下一次递归
),t1 as materialized (select *,dense_rank()over(order by lvl)rk from t where lvl & 2000<>0)
, t2(v, p, rk) as materialized (select 0 v, 1::double p, 0 rk
union all
select t1.v + t2.v, sum(t1.p * t2.p),max(t1.rk)from t1,t2 where t1.rk=t2.rk+1 GROUP BY t1.v + t2.v)
select v,p from t2 where rk=(select max(rk) from t2)
order by 1;
│ 15998 │ 0.0 │
│ 16000 │ 0.0 │
├───────┴────────┤
│ 13999 rows │
│ (40 shown) │
└────────────────┘
Run Time (s): real 0.812 user 2.718750 sys 0.000000
上述2000次的结果其实没有意义,因为超过了double类型的取值范围,一律以0表示了。 postgresql还不支持在递归CTE中使用聚合函数,但可以使用分析函数,而且它支持高精度的decimal,比如小数点后390位,把单次递归改写如下:
postgres=# WITH recursive t(v,p,lvl,rn) AS (
SELECT face_value v
,probability p
,1
,1
FROM die
UNION ALL
SELECT d.face_value + t.v ---- 第N次投掷,面值为前N-1次的各种结果加上die表的六种可能面值
,SUM(d.probability * t.p)over(partition BY d.face_value + t.v)::decimal(400,390) ---- 把相同的总面值的概率加起来
,(t.lvl)+1 ,row_number()over(partition BY d.face_value + t.v order by 1) ::int
FROM die d,t
WHERE t.lvl<100 and t.rn=1
---- 按照总面值分组聚合,再做下一次递归
)
select count(*),max(p)::float,min(p)::float from(
SELECT v,sum(p)p
FROM t
WHERE lvl = 100 and rn=1
group by v)
;
count | max | min
-------+----------------------+-------------------------
699 | 0.019748498208970528 | 1.2074673472413666e-108
(1 row)
Time: 671.816 ms
N=300时,最小值已经无法用双精度浮点表示了,改用科学计数法:
select count(*),max(p)::float,to_char(min(p), '9.99EEEE') from(
SELECT v,sum(p)p
FROM t
WHERE lvl = 300 and rn=1
group by v)
;
count | max | to_char
-------+----------------------+------------
2099 | 0.011405066538137177 | 1.76e-324
(1 row)
Time: 7731.950 ms (00:07.732)
postgresql支持union 后的子句中有子查询,所以能优化如下
WITH recursive t(v,p,lvl,rn) AS (
SELECT face_value v
,probability p
,1
,1
FROM die
UNION ALL
select * from(
SELECT d.face_value + t.v ---- 第N次投掷,面值为前N-1次的各种结果加上die表的六种可能面值
,SUM(d.probability * t.p)over(partition BY d.face_value + t.v)::decimal(400,390) ---- 把相同的总面值的概率加起来
,(t.lvl)+1 ,row_number()over(partition BY d.face_value + t.v order by 1) ::int rn
FROM die d,t
WHERE t.lvl<300 and t.rn=1
---- 按照总面值分组聚合,再做下一次递归
)b where rn=1
)
select count(*),max(p)::float,to_char(min(p), '9.99EEEE') from(
SELECT v,sum(p)p
FROM t
WHERE lvl = 300 and rn=1
group by v)a
;
count | max | to_char
-------+--------------------+------------
2099 | 0.0114050665381372 | 1.76e-324
(1 行记录)
时间:6356.822 ms
postgresql不许递归引用的表出现多次,报错如下,所以无法改用分析函数实现快速幂。
ERROR: recursive reference to query "t" must not appear more than once
LINE 11: ,(t.lvl)*2 ,row_number()over(partition BY d.face_...
Oracle 19c垫底,递归CTE可以使用分析函数,但精度不如postgresql,速度也不如postgresql,单次递归算到100都要26秒,怪不得当时参赛选手纷纷采取极限优化手段。
SQL> WITH t(v,p,lvl,rn) AS (
SELECT face_value v
,probability p
,1
,1
FROM die
UNION ALL
SELECT d.face_value + t.v ---- 第N次投掷,面值为前N-1次的各种结果加上die表的六种可能面值
,SUM(d.probability * t.p)over(partition BY d.face_value + t.v) ---- 把相同的总面值的概率加起来
,(t.lvl)+1 ,row_number()over(partition BY d.face_value + t.v order by 1)
FROM die d,t
WHERE t.lvl<100 and t.rn=1
---- 按照总面值分组聚合,再做下一次递归
)
select count(*) from(
SELECT v,sum(p)p
FROM t
WHERE lvl = 100 and rn=1
group by v)
;
COUNT(*)
----------
699
Elapsed: 00:00:26.55