postgres sql with_bucket - ghdrako/doc_snipets GitHub Wiki
Creating Histograms with Postgres
Given the number of buckets and max/min value, width_bucket returns the index for the bucket that a value will fall.
SELECT value, width_bucket(value, 0, 100, 10) AS bucket FROM generate_series(0, 100) AS value;
WITH formatted_data AS (
SELECT * FROM (VALUES (13), (42), (18), (62), (93), (47), (51), (41), (1)) AS t (value)
), bucket_settings AS (
SELECT
5 AS bucket_count,
null::integer AS min_value, -- can be null or an integer
null::integer AS max_value -- can be null or an integer
), calculated_bucket_settings AS (
SELECT
(SELECT bucket_count FROM bucket_settings) AS bucket_count,
COALESCE(
(SELECT min_value FROM bucket_settings),
(SELECT min(value) FROM formatted_data)
) AS min_value,
COALESCE(
(SELECT max_value FROM bucket_settings),
(SELECT max(value) + 1 FROM formatted_data)
) AS max_value
), histogram AS (
SELECT
WIDTH_BUCKET(value, min_value, max_value, (SELECT bucket_count FROM bucket_settings)) AS bucket,
COUNT(value) AS frequency
FROM formatted_data, calculated_bucket_settings
GROUP BY 1
ORDER BY 1
)
SELECT
bucket,
frequency,
CONCAT(
(min_value + (bucket - 1) * (max_value - min_value) / bucket_count)::INT,
' - ',
(((min_value + bucket * (max_value - min_value) / bucket_count)) - 1)::INT) AS range
FROM histogram, calculated_bucket_settings;
bucket | frequency | range
---------+-----------+---------
1 | 3 | 1 - 18
3 | 4 | 38 - 55
4 | 1 | 56 - 74
5 | 1 | 75 - 93
(4 rows)
all buckets and frequencies are not represented. So, if a value is empty, we need to fill in the frequency with a zero. This is where SQL requires thinking in sets. We can use generate_series to generate all values for the buckets, then join the histogram to all values. Flipping the order of the query around makes it simpler than joining an incomplete set. In the following query, we’ve built out the buckets in the all_buckets CTE, then joined that to the histogram values:
WITH formatted_data AS (
SELECT * FROM (VALUES (13), (42), (18), (62), (93), (47), (51), (41), (1)) AS t (value)
), bucket_settings AS (
SELECT
5 AS bucket_count,
0::integer AS min_value, -- can be null or an integer
100::integer AS max_value -- can be null or an integer
), calculated_bucket_settings AS (
SELECT
(SELECT bucket_count FROM bucket_settings) AS bucket_count,
COALESCE(
(SELECT min_value FROM bucket_settings),
(SELECT min(value) FROM formatted_data)
) AS min_value,
COALESCE(
(SELECT max_value FROM bucket_settings),
(SELECT max(value) + 1 FROM formatted_data)
) AS max_value
), histogram AS (
SELECT
WIDTH_BUCKET(value, calculated_bucket_settings.min_value, calculated_bucket_settings.max_value + 1, (SELECT bucket_count FROM bucket_settings)) AS bucket,
COUNT(value) AS frequency
FROM formatted_data, calculated_bucket_settings
GROUP BY 1
ORDER BY 1
), all_buckets AS (
SELECT
fill_buckets.bucket AS bucket,
FLOOR(calculated_bucket_settings.min_value + (fill_buckets.bucket - 1) * (calculated_bucket_settings.max_value - calculated_bucket_settings.min_value) / (SELECT bucket_count FROM bucket_settings)) AS min_value,
FLOOR(calculated_bucket_settings.min_value + fill_buckets.bucket * (calculated_bucket_settings.max_value - calculated_bucket_settings.min_value) / (SELECT bucket_count FROM bucket_settings)) AS max_value
FROM calculated_bucket_settings,
generate_series(1, calculated_bucket_settings.bucket_count) AS fill_buckets (bucket))
SELECT
all_buckets.bucket AS bucket,
CASE
WHEN all_buckets IS NULL THEN
'out of bounds'
ELSE
CONCAT(all_buckets.min_value, ' - ', all_buckets.max_value - 1)
END AS range,
SUM(COALESCE(histogram.frequency, 0)) AS frequency
FROM all_buckets
FULL OUTER JOIN histogram ON all_buckets.bucket = histogram.bucket
GROUP BY 1, 2
ORDER BY bucket;