postgres sql with_bucket - ghdrako/doc_snipets GitHub Wiki

Creating Histograms with Postgres

Given the number of buckets and max/min value, width_bucket returns the index for the bucket that a value will fall.

SELECT value, width_bucket(value, 0, 100, 10) AS bucket FROM generate_series(0, 100) AS value;

WITH formatted_data AS (
  SELECT * FROM (VALUES (13), (42), (18), (62), (93), (47), (51), (41), (1)) AS t (value)
), bucket_settings AS (
	SELECT
		5 AS bucket_count,
		null::integer AS min_value, -- can be null or an integer
		null::integer AS max_value -- can be null or an integer
), calculated_bucket_settings AS (
	SELECT
		(SELECT bucket_count FROM bucket_settings) AS bucket_count,
		COALESCE(
			(SELECT min_value FROM bucket_settings),
			(SELECT min(value) FROM formatted_data)
		) AS min_value,
		COALESCE(
			(SELECT max_value FROM bucket_settings),
			(SELECT max(value) + 1 FROM formatted_data)
		) AS max_value
), histogram AS (
  SELECT
     WIDTH_BUCKET(value, min_value, max_value, (SELECT bucket_count FROM bucket_settings)) AS bucket,
     COUNT(value) AS frequency
   FROM formatted_data, calculated_bucket_settings
   GROUP BY 1
   ORDER BY 1
)

SELECT
   bucket,
   frequency,
   CONCAT(
     (min_value + (bucket - 1) * (max_value - min_value) / bucket_count)::INT,
     ' - ',
     (((min_value + bucket * (max_value - min_value) / bucket_count)) - 1)::INT) AS range
FROM histogram, calculated_bucket_settings;

bucket   | frequency |  range
---------+-----------+---------
       1 |         3 | 1 - 18
       3 |         4 | 38 - 55
       4 |         1 | 56 - 74
       5 |         1 | 75 - 93
(4 rows)

all buckets and frequencies are not represented. So, if a value is empty, we need to fill in the frequency with a zero. This is where SQL requires thinking in sets. We can use generate_series to generate all values for the buckets, then join the histogram to all values. Flipping the order of the query around makes it simpler than joining an incomplete set. In the following query, we’ve built out the buckets in the all_buckets CTE, then joined that to the histogram values:

WITH formatted_data AS (
  SELECT * FROM (VALUES (13), (42), (18), (62), (93), (47), (51), (41), (1)) AS t (value)
), bucket_settings AS (
  SELECT
        5 AS bucket_count,
        0::integer AS min_value, -- can be null or an integer
        100::integer AS max_value -- can be null or an integer
), calculated_bucket_settings AS (
	SELECT
	  (SELECT bucket_count FROM bucket_settings) AS bucket_count,
	  COALESCE(
	          (SELECT min_value FROM bucket_settings),
	          (SELECT min(value) FROM formatted_data)
	  ) AS min_value,
	  COALESCE(
	          (SELECT max_value FROM bucket_settings),
	          (SELECT max(value) + 1 FROM formatted_data)
	  ) AS max_value
), histogram AS (
  SELECT
    WIDTH_BUCKET(value, calculated_bucket_settings.min_value, calculated_bucket_settings.max_value + 1, (SELECT bucket_count FROM bucket_settings)) AS bucket,
    COUNT(value) AS frequency
  FROM formatted_data, calculated_bucket_settings
  GROUP BY 1
  ORDER BY 1
 ), all_buckets AS (
  SELECT
    fill_buckets.bucket AS bucket,
    FLOOR(calculated_bucket_settings.min_value + (fill_buckets.bucket - 1) * (calculated_bucket_settings.max_value - calculated_bucket_settings.min_value) / (SELECT bucket_count FROM bucket_settings)) AS min_value,
    FLOOR(calculated_bucket_settings.min_value + fill_buckets.bucket * (calculated_bucket_settings.max_value - calculated_bucket_settings.min_value) / (SELECT bucket_count FROM bucket_settings)) AS max_value
  FROM calculated_bucket_settings,
	  generate_series(1, calculated_bucket_settings.bucket_count) AS fill_buckets (bucket))

 SELECT
   all_buckets.bucket AS bucket,
   CASE
   WHEN all_buckets IS NULL THEN
	   'out of bounds'
	 ELSE
     CONCAT(all_buckets.min_value, ' - ', all_buckets.max_value - 1)
   END AS range,
   SUM(COALESCE(histogram.frequency, 0)) AS frequency
 FROM all_buckets
 FULL OUTER JOIN histogram ON all_buckets.bucket = histogram.bucket
 GROUP BY 1, 2
 ORDER BY bucket;