炒冷饭,一道16年前的北美OUG‐SQL挑战赛回顾‐2 - l1t1/note GitHub Wiki

无意中发现,newkid在《剑破冰山》一书中期待的在递归CTE中使用聚合函数的写法在duckdb v1.1.3中已经实现了,

WITH recursive t(v,p,lvl) AS (
   SELECT face_value  v
         ,probability p
         ,1
     FROM die
   UNION ALL
   SELECT d.face_value + t.v       ---- 第N次投掷,面值为前N-1次的各种结果加上die表的六种可能面值
         ,SUM(d.probability * t.p) ---- 把相同的总面值的概率加起来
         ,MAX(t.lvl)+1             
     FROM die d,t
    WHERE t.lvl<2000
    GROUP BY d.face_value + t.v    ---- 按照总面值分组聚合,再做下一次递归
    )
SELECT *
  FROM t
 WHERE lvl = 2000
 order by 1;
 
│ 15998 │    0.0 │  2000 │
│ 16000 │    0.0 │  2000 │
├───────┴────────┴───────┤
│ 13999 rows (40 shown)  │
└────────────────────────┘
Run Time (s): real 2.997 user 6.000000 sys 2.125000

用快速幂法写了个双重递归,N=100时和newkid的单次递归也差不多快,N=2000时还比单次递归用时多一倍多。

with recursive 
   t(v,p,lvl) AS (
     SELECT face_value  v
           ,probability p
           ,1
       FROM die
     UNION ALL
     SELECT d.v + t.v       ---- 第N次投掷,面值为前N-1次的各种结果加上die表的六种可能面值
           ,SUM(d.p * t.p) ---- 把相同的总面值的概率加起来
           ,MAX(t.lvl)*2             
       FROM t d,t
      WHERE t.lvl<2000
      GROUP BY d.v + t.v    ---- 按照总面值分组聚合,再做下一次递归
),t1 as(select *,dense_rank()over(order by lvl)rk from t where lvl & 2000<>0)
, t2(v, p, rk) as(select 0 v, 1::double p, 0 rk
union all
select t1.v + t2.v, sum(t1.p * t2.p),max(t1.rk)from t1,t2 where t1.rk=t2.rk+1 GROUP BY t1.v + t2.v)
select v,p from t2 where rk=(select max(rk) from t2)
order by 1; 

│ 16000 │    0.0 │
├───────┴────────┤
│   13999 rows   │
│   (40 shown)   │
└────────────────┘
Run Time (s): real 6.965 user 18.281250 sys 0.109375

用了materialized关键字才比单次递归快了。

with recursive 
   t(v,p,lvl)AS materialized (
     SELECT face_value  v
           ,probability p
           ,1
       FROM die
     UNION ALL
     SELECT d.v + t.v       ---- 第N次投掷,面值为前N-1次的各种结果加上die表的六种可能面值
           ,SUM(d.p * t.p) ---- 把相同的总面值的概率加起来
           ,MAX(t.lvl)*2             
       FROM t d,t
      WHERE t.lvl<2000
      GROUP BY d.v + t.v    ---- 按照总面值分组聚合,再做下一次递归
),t1 as materialized (select *,dense_rank()over(order by lvl)rk from t where lvl & 2000<>0)
, t2(v, p, rk) as materialized (select 0 v, 1::double p, 0 rk
union all
select t1.v + t2.v, sum(t1.p * t2.p),max(t1.rk)from t1,t2 where t1.rk=t2.rk+1 GROUP BY t1.v + t2.v)
select v,p from t2 where rk=(select max(rk) from t2)
order by 1; 
│ 15998 │    0.0 │
│ 16000 │    0.0 │
├───────┴────────┤
│   13999 rows   │
│   (40 shown)   │
└────────────────┘
Run Time (s): real 0.812 user 2.718750 sys 0.000000

上述2000次的结果其实没有意义,因为超过了double类型的取值范围,一律以0表示了。 postgresql还不支持在递归CTE中使用聚合函数,但可以使用分析函数,而且它支持高精度的decimal,比如小数点后390位,把单次递归改写如下:

postgres=# WITH recursive t(v,p,lvl,rn) AS (
     SELECT face_value  v
           ,probability p
           ,1
,1
       FROM die
     UNION ALL
     SELECT d.face_value + t.v       ---- 第N次投掷,面值为前N-1次的各种结果加上die表的六种可能面值
           ,SUM(d.probability * t.p)over(partition BY d.face_value + t.v)::decimal(400,390) ---- 把相同的总面值的概率加起来
           ,(t.lvl)+1 ,row_number()over(partition BY d.face_value + t.v order by 1)  ::int          
       FROM die d,t
      WHERE t.lvl<100 and t.rn=1
          ---- 按照总面值分组聚合,再做下一次递归
      )
select count(*),max(p)::float,min(p)::float from(
  SELECT v,sum(p)p 
    FROM t
   WHERE lvl = 100 and rn=1
group by v)
;
 count |         max          |           min           
-------+----------------------+-------------------------
   699 | 0.019748498208970528 | 1.2074673472413666e-108
(1 row)

Time: 671.816 ms

N=300时,最小值已经无法用双精度浮点表示了,改用科学计数法:

select count(*),max(p)::float,to_char(min(p), '9.99EEEE') from(
  SELECT v,sum(p)p 
    FROM t
   WHERE lvl = 300 and rn=1
group by v)
;


 count |         max          |  to_char   
-------+----------------------+------------
  2099 | 0.011405066538137177 |  1.76e-324
(1 row)

Time: 7731.950 ms (00:07.732)

postgresql支持union 后的子句中有子查询,所以能优化如下

WITH recursive t(v,p,lvl,rn) AS (
     SELECT face_value  v
           ,probability p
           ,1
,1
       FROM die
     UNION ALL
select * from(
     SELECT d.face_value + t.v       ---- 第N次投掷,面值为前N-1次的各种结果加上die表的六种可能面值
           ,SUM(d.probability * t.p)over(partition BY d.face_value + t.v)::decimal(400,390) ---- 把相同的总面值的概率加起来
           ,(t.lvl)+1 ,row_number()over(partition BY d.face_value + t.v order by 1)  ::int  rn        
       FROM die d,t
      WHERE t.lvl<300 and t.rn=1
          ---- 按照总面值分组聚合,再做下一次递归
      )b where rn=1
)
select count(*),max(p)::float,to_char(min(p), '9.99EEEE') from(
  SELECT v,sum(p)p 
    FROM t
   WHERE lvl = 300 and rn=1
group by v)a
;

 count |        max         |  to_char   
-------+--------------------+------------
  2099 | 0.0114050665381372 |  1.76e-324
(1 行记录)

时间:6356.822 ms

postgresql不许递归引用的表出现多次,报错如下,所以无法改用分析函数实现快速幂。

ERROR:  recursive reference to query "t" must not appear more than once
LINE 11:            ,(t.lvl)*2 ,row_number()over(partition BY d.face_...
         

Oracle 19c垫底,递归CTE可以使用分析函数,但精度不如postgresql,速度也不如postgresql,单次递归算到100都要26秒,怪不得当时参赛选手纷纷采取极限优化手段。

SQL> WITH  t(v,p,lvl,rn) AS (
     SELECT face_value  v
           ,probability p
           ,1
,1
       FROM die
     UNION ALL
     SELECT d.face_value + t.v       ---- 第N次投掷,面值为前N-1次的各种结果加上die表的六种可能面值
           ,SUM(d.probability * t.p)over(partition BY d.face_value + t.v) ---- 把相同的总面值的概率加起来
           ,(t.lvl)+1 ,row_number()over(partition BY d.face_value + t.v order by 1)            
       FROM die d,t
      WHERE t.lvl<100 and t.rn=1
          ---- 按照总面值分组聚合,再做下一次递归
      )
select count(*) from(
  SELECT v,sum(p)p 
    FROM t
   WHERE lvl = 100 and rn=1
group by v)
;   

  COUNT(*)
----------
       699

Elapsed: 00:00:26.55