Day16_SQL분석 23‐25_SQL - bonniekwon0721/Dataanalytics-study GitHub Wiki
빅쿼리에서 날짜 다루기 1
| DATE | 2017-11-25 |
|---|---|
| DATETIME | 2017-11-25T11:10:33 |
| TIMESTAMP | UTC 11:10:33 2017-11-25 |
| TIME | 11:10:33 |
Select
DATE(2016, 12, 25) AS date_ymd
DATE(DATETIME ‘2016-12-25 23:59:59’) AS date_dt,
DATE(TIMESTAMP ‘2016-12-25 05:30:00+07’, ‘America/Los_Angeles’ AS date_tstz;
Select
DATETIME(2008, 12, 25, 05, 30, 00) AS datetime_ymd
-- DATE() / DATETIME() / TIMESTAMP()
SELECT
order_purchase_timestamp,
DATE(order_purchase_timestamp) as ord_date,
DATETIME(order_purchase_timestamp) as ord_dt,
TIMESTAMP(order_purchase_timestamp) as ord_ts,
TIMESTAMP(DATE(order_purchase_timestamp)) as ord_date_ts,
TIME(order_purchase_timestamp) as ord_time
FROM `olist.olist_orders`
빅쿼리에서 날짜 다루기 2
- 날짜에서 일부분만 추출하고 싶다면?
EXTRACT(’추출할 부분’ FROM ‘날짜 컬럼’)
| 추출할 부분 | 의미 | 결과(정수) |
|---|---|---|
| YEAR | 연도 | 2017 |
| MONTH | 월 | 11 |
| DAY | 일 | 25 |
| DAYOFWEEK | 요일(0:일요일 ~ 6: 토요일) | 1 |
| WEEK | 주(기본 일요일 시작) | 13 |
| WEEK(MONDAY) | 주(월요일 시작) | 12 |
| QUARTER | 분기 | 2 |
-- 필요한 값만 추출하는 EXTRACT
SELECT
order_purchase_timestamp,
EXTRACT(YEAR FROM order_purchase_timestamp) AS YEAR,
EXTRACT(QUARTER FROM order_purchase_timestamp) AS QUARTER,
EXTRACT(MONTH FROM order_purchase_timestamp) AS MONTH,
EXTRACT(DAY FROM order_purchase_timestamp) AS DAY,
EXTRACT(DAYOFWEEK FROM order_purchase_timestamp) AS DAYOFWEEK,
EXTRACT(WEEK FROM order_purchase_timestamp) AS WEEK,
EXTRACT(WEEK(MONDAY) FROM order_purchase_timestamp) AS WEEK_MONDAY,
EXTRACT(HOUR FROM order_purchase_timestamp) AS HOUR,
EXTRACT(MINUTE FROM order_purchase_timestamp) AS MINUTE,
EXTRACT(SECOND FROM order_purchase_timestamp) AS SECOND,
FROM `olist.olist_orders`
WHERE
DATE(order_purchase_timestamp) = '2017-11-26' -- 일요일
;
빅쿼리에서 날짜 다루기 3
- 날짜에서 원하는 정보까지만 남기고 싶다면?
DATE_TRUNC('날짜', '남기는 부분')DATETIME_TRUNC('날짜', '남기는 부분')TIMESTAMP_TRUNC('날짜', '남기는 부분')
| 남기는 부분 | 의미 | 결과(날짜) | 예시(2017-11-26 16:13:13 UTC) |
|---|---|---|---|
| YEAR | 연도 | 해당 연도의 첫 날 | 2017-01-01 00:00:00 UTC |
| MONTH | 월 | 해당 연도, 월의 첫 날 | 2017-11-0100:00:00 UTC |
| DAY | 일 | 해당 날짜의 0시 0분 | 2017-11-26 00:00:00 UTC |
| WEEK | 주(기본 일요일 시작) | 해당 주차의 첫 날(일) | 2017-11-26 00:00:00 UTC |
| WEEK(MONDAY) | 주(월요일 시작) | 해당 주차의 첫 날(월) | 2017-11-20 00:00:00 UTC |
| QUARTER | 분 | 해당 분기의 첫 날 | 2017-10-01 00:00:00 UTC |
& HOUR, MINUTES, SECOND
-- 원하는 정보까지만 남기는 DATE/DATETIME/TIMESTAMP_TRUNC
SELECT
order_purchase_timestamp,
TIMESTAMP_TRUNC(order_purchase_timestamp, YEAR) AS YEAR,
TIMESTAMP_TRUNC(order_purchase_timestamp, QUARTER) AS QUARTER,
TIMESTAMP_TRUNC(order_purchase_timestamp, MONTH) AS MONTH,
TIMESTAMP_TRUNC(order_purchase_timestamp, DAY) AS DAY,
TIMESTAMP_TRUNC(order_purchase_timestamp, WEEK) AS WEEK,
TIMESTAMP_TRUNC(order_purchase_timestamp, WEEK(MONDAY)) AS WEEK_MONDAY,
TIMESTAMP_TRUNC(order_purchase_timestamp, HOUR) AS HOUR,
TIMESTAMP_TRUNC(order_purchase_timestamp, MINUTE) AS MINUTE,
TIMESTAMP_TRUNC(order_purchase_timestamp, SECOND) AS SECOND,
FROM `olist.olist_orders`
WHERE
DATE(order_purchase_timestamp) = '2017-11-26' -- 일요일
;
SELECT
order_purchase_timestamp,
EXTRACT(YEAR FROM order_purchase_timestamp) AS YEAR,
TIMESTAMP_TRUNC(order_purchase_timestamp, YEAR) AS YEAR2,
EXTRACT(YEAR FROM order_purchase_timestamp) AS YEAR,
TIMESTAMP_TRUNC(order_purchase_timestamp, MONTH) AS MONTH2,
FROM `olist.olist_orders`
WHERE
DATE(order_purchase_timestamp) = '2017-11-26' -- 일요일
-
상세 설명

날짜별로 지표 뽑기
/* 첫 번째 : 주문 1건당 주문금액, 판매상품의 수를 구합니다 */
with tb as (
select
item.order_id,
sum(item.price) as ord_amt,
count(item.order_item_id) as prd_cnt
from`olist.olist_order_items` as item
group by item.order_id
)
/* 두 번째 : 주문정보 테이블에 1번 결과와, 고객 unique ID를 붙입니다 */
, base as (
select
ord.order_approved_at,
DATE(ord.order_approved_at) as ord_date,
DATETIME(ord.order_approved_at) as ord_dt,
TIMESTAMP(ord.order_approved_at) as ord_ts,
--EXTRACT / TRUNC
--EXTRACT
EXTRACT(YEAR FROM ord.order_approved_at) as ord_year,
EXTRACT(MONTH FROM ord.order_approved_at) as ord_month,
--TRUNC
TIMESTAMP_TRUNC(ord.order_approved_at, YEAR) as ord_year_t,
TIMESTAMP_TRUNC(ord.order_approved_at, MONTH) as ord_month_t,
ord.order_id,
ord.customer_id,
cust.customer_unique_id,
tb.ord_amt,
tb.prd_cnt
from `olist.olist_orders` as ord
left join `olist.olist_customers` as cust
on ord.customer_id = cust.customer_id
inner join tb
on ord.order_id = tb.order_id
)
/* 세 번째 : 2번 결과를 집계합니다 */
select
ord_month_t,
sum(ord_amt) as `총매출`,
count(distinct order_id) as `총주문수`,
sum(prd_cnt) as `총판매상품수`,
sum(ord_amt) / count(distinct order_id) as `주문당평균가격`,
sum(prd_cnt) / count(distinct order_id) as `평균판매상품수`,
sum(ord_amt) / sum(prd_cnt) as `제품개당평균가격`,
count(distinct customer_unique_id) as `주문고객수` ,
count(distinct order_id) / count(distinct customer_unique_id) as `주문빈도`
from base
GROUP BY ord_month_t
ORDER BY ord_month_t
Appendix. 빅쿼리의 유용한 함수 1
필요 없는 컬럼은 빼고 불러오자
- SELECT* EXCEPT(제외할 컬럼)
SELECT * EXCEPT (shipping_limit_date, freight_value) FROM olist.olist_order_items``
;
- SELECT * REPLACE ('대체할 값' AS 컬럼명)
SELECT * REPLACE (price * 10000 AS price) FROM olist.olist_order_items``
SELECT *
FROM `olist.olist_order_items`
LIMIT 100
;
-- EXCEPT
SELECT * EXCEPT (shipping_limit_date, freight_value)
FROM `olist.olist_order_items`
LIMIT 100
;
-- ADD
SELECT *,
price * 10000 AS price
FROM `olist.olist_order_items`
LIMIT 100
;
-- REPLACE
-- price 에 * 10,000 해주고 싶어요!
SELECT * REPLACE (price * 10000 AS price)
FROM `olist.olist_order_items`
LIMIT 100
;
Appendix. 빅쿼리의 유용한 함수 2
데이터 타입을 바꾸는 CAST 그리고 SAFE_CAST
- CAST(’값' As '데이터타입’)
- SAFE_CAST(’값' As ‘데이터타입’)
A CAST ('123' AS INT64) -- INT64 SAFE_CAST ('123' AS INT64) -- INT64 CAST ('abc' AS INT64) -- error! SAFE_CAST ('abc' AS INT64) -- null
오류를 발생시키지 않는 SAFE 연산
| SAFE_ADD(X, Y) = X + Y | 에러 있을 경우 에러를 발생시키지 않고 nu로 반환함 |
|---|---|
| SAFE_SUBTRACT(X, Y) = X - Y | 에러 있을 경우 에러를 발생시키지 않고 nu로 반환함 |
| SAFE_MULTIPLY(X, Y) = X * Y | 에러 있을 경우 에러를 발생시키지 않고 nu로 반환함 |
| SAFE_DIVIDE(X, Y) = X / Y | 에러 있을 경우 에러를 발생시키지 않고 nu로 반환함 |
-- CAST & SAFE_CAST
SELECT
CAST ('12345' AS INT64) AS cast_result,
SAFE_CAST ('12345' AS INT64) AS safe_result
;
SELECT
CAST ('asdf' AS INT64) AS cast_result
;
SELECT
SAFE_CAST ('asdf' AS INT64) AS safe_result
;
-- DIVIDE & SAFE_DIVIDE
SELECT
10 / 5
;
SELECT
10 / 0
;
SELECT
SAFE_DIVIDE(5, 0),
IFNULL(SAFE_DIVIDE(5, 0), 0)
;
대시보드용 데이터 준비
자주 쓰이는 IN / NOT IN
- 컬럼에는 A, B, C, D, E 중 하나의 값이 있다고 가정
| WHERE '컬럼' IN (A, B, C) | -> A, B, C에 해당 |
|---|---|
| WHERE ZE' NOT IN (A , B, C) | -> D, E에 해당 |
| 특정 카테고리 매출만 뽑아주세요! | WHERE 카테고리 IN (A B, G....) |
|---|---|
| 특정 지역에 사는 고객만 뽑아주세요! | WHERE 거주지 IN (A B, C...) |
| 특정 판매자의 제품만 뽑아주세요! | WHERE 판매자 IN (A B, C....) |
SELECT
order_status,
COUNT(DISTINCT order_ud) AS ord_cnt
FROM olist.olist_orders
WHERE order_status IN (’shipped’, ‘delivered’
)
— IN 조건 : 두 개 상태로 한정
group by 1
;
/*
1. IN을 활용해 특정 상태에서만 집계 ('shipped', 'delivered')
2. 일자별 지표
3. +) Round를 활용해서 소수 둘째 자리까지만 표시하고, SAFE 연산 사용
4. 컬럼명 영문으로 대체
*/
/* 첫 번째 : 주문 1건당 주문금액, 판매상품의 수를 구합니다 */
with tb as (
select
item.order_id,
sum(item.price) as ord_amt,
count(item.order_item_id) as prd_cnt
from`olist.olist_order_items` as item
group by item.order_id
)
/* 두 번째 : 주문정보 테이블에 1번 결과와, 고객 unique ID를 붙입니다 */
, base as (
select
DATE(ord.order_approved_at) as ord_date,
ord.order_id,
ord.customer_id,
cust.customer_unique_id,
tb.ord_amt,
tb.prd_cnt
from `olist.olist_orders` as ord
left join `olist.olist_customers` as cust
on ord.customer_id = cust.customer_id
inner join tb
on ord.order_id = tb.order_id
WHERE 1=1 -- 이것은 아무 의미도 없고. TRUE를 의미 함.
AND order_status IN ('delivered', 'shipped')
AND order_approved_at IS NOT NULL -- NULL 값은 제외하고 가져오겠다.
)
/* 세 번째 : 2번 결과를 집계합니다 */
select
ord_date,
ROUND(sum(ord_amt), 2) as ord_amt,
count(distinct order_id) as ord_cnt,
sum(prd_cnt) as prd_cnt,
ROUND(IFNULL(SAFE_DIVIDE(sum(ord_amt), count(distinct order_id)),0), 2) as avg_ord_amt, -- 주문 당 평균 가격
ROUND(IFNULL(SAFE_DIVIDE(sum(prd_cnt), count(distinct order_id)),0), 2) as avg_prd_cnt, -- 주문 당 평균 판매상품 수
ROUND(IFNULL(SAFE_DIVIDE(sum(ord_amt), sum(prd_cnt)),0), 2) as avg_price, -- 판매 제품 당 평균 가격
count(distinct customer_unique_id) as cust_cnt ,
ROUND(IFNULL(SAFE_DIVIDE(count(distinct order_id), count(distinct customer_unique_id)),0), 2) as cust_freq
from base
GROUP BY ord_date
ORDER BY ord_date
Adhoc 1 - 가장 고객이 많은 지역은?
요청사항 #1
- 주(state)별 침투율을 확인하고, : 특정 지역의 인구에 비해서 우리의 고객수를 확인하겠다는 뜻
- 침투율이 특히 낮은 지역을 중심으로 침투율을 개선해보려고 합니다.
- 이를 위해 주별 주문수와 고객수를 뽑아주세요.
/* 주(state)별 주문수, 고객수를 추출합니다 */
-- 고객 정보 기준
-- 주문 상태는 필터링하지 않음
SELECT
customer_state,
COUNT(DISTINCT ord.order_id) AS ord_cnt,
COUNT(DISTINCT cust.customer_unique_id) AS cust_cnt
FROM `olist.olist_orders` AS ord
LEFT JOIN `olist.olist_customers` AS cust
ON ord.customer_id = cust.customer_id
GROUP BY customer_state
ORDER BY cust_cnt DESC
Adhoc 2 - 서브 쿼리를 이용한 join
요청사항 #2
- 주(state)별 주문수 뽑아야 하는데
- 고객수 Top 2인 상파울루(SP), 리우데자네이루(R) 주만 뽑아주세요 : 앞에서 한 내용에 조건절만 걸면 될 것 같다.
- (서브쿼리를 이용한 join을 연습해봅시다!)
LEFT JOIN (SELECT customer_id,
customer_unique_id,
customer_state FROM 'olist.olist_customers")
LEFT JOIN (SELECT customer_id, customer_unique_id, customer_state FROM olist.olist_customers' WHERE customer_state in ('SP','RJ'))
/*
주(state)별 주문수 뽑아야 하는데
고객수 Top 2인 상파울루(SP), 리우데자네이루(RJ) 주만 뽑아주세요
조건. 서브쿼리를 이용한 join!
*/
WITH tb AS (
SELECT
customer_id,
customer_state,
customer_unique_id
FROM `olist.olist_customers`
WHERE customer_state IN('SP', 'RJ')
)
SELECT
DISTINCT customer_state
FROM tb
;
--테이블 조인 전.
SELECT
customer_id,
customer_state,
customer_unique_id
FROM `olist.olist_customers`
WHERE customer_state IN('SP', 'RJ')
;
-- 최종 쿼리.
SELECT
cust.customer_state,
ord.order_id
FROM `olist.olist_orders` AS ord
LEFT JOIN (SELECT
customer_id,
customer_state,
customer_unique_id
FROM `olist.olist_customers`
WHERE customer_state IN('SP', 'RJ')) As cust
ON ord.customer_id = cust.customer_id
WHERE cust.customer_state IS NOT NULL -- state 가 NULL이 아닌 값만 보겠다.
;
Adhoc 3 - 순위 매기기
요청사항 #3 (현재 2018년이라고 가정)
- 지역별 광고 캠페인을 진행해보려고 합니다.
고객수를 보고 고객이 많은 지역부터 전략을 짜려고 하는데,
'이를 위해 2017년의 도시(ctgy)별 주문수와 고객술, 매출을 뽑아주세요.
- 조건. 고객수 기준 순위도 붙여주세요 그리고 전체 순위, 주(state) 내에서의 순위 둘 다 보고 싶어요 배송완료(delivered)된 건만 집계해주세요
순위/행 번호를 매기는 여러가지 방법
| 점수 | ROW_NUMBER() | RANK() | DENSE_RANK() |
|---|---|---|---|
| 100점 | 1 | 1 | 1 |
| 95점 | 2 | 2 | 2 |
| 95점 | 3 | 2 | 2 |
| 80점 | 4 | 4 | 3 |
| 중복 없는 순위(고유) | 동점일 때 중복 순위(공동 등수) |
- 공동 순위자가 있을 때 그 수만큼 건너뛰고 다음 등수를 매김 | 동점일 때 중복 순위(공동 등수)
- 순위를 연속으로 매김 |
| ROW _NUMBER() RANK()
| DENSE RANK() | OVER (ORDER BY [순위를 매기는 조건) | 컬럼 & 오름차순(ASC),내림차순(DESC) |
|---|---|---|
| ROW_NUMBER() | ||
| RANK() | ||
| DENSE_RANK() | OVER (PARTITION BY [구분 기준] | |
| ORDER BY [순위를 매기는 조건) | 예시) 전교 석차 / 학급별 석차 |
--데이터 생성하기
WITH tb AS (
SELECT 'A' AS name,'class1' AS class, 100 AS score
UNION ALL SELECT 'B' AS name, 'class1' AS class, 90 AS score
UNION ALL SELECT 'C' AS name, 'class1' AS class, 95 AS score
UNION ALL SELECT 'D' AS name, 'class1' AS class, 95 AS score
UNION ALL SELECT 'E' AS name, 'class1' AS class, 80 AS score
UNION ALL SELECT 'F' AS name, 'class2' AS class, 100 AS score
UNION ALL SELECT 'G' AS name, 'class2' AS class, 90 AS score
UNION ALL SELECT 'H' AS name, 'class2' AS class, 80 AS score
UNION ALL SELECT 'I' AS name, 'class2' AS class, 70 AS score
UNION ALL SELECT 'J' AS name, 'class2' AS class, 80 AS score
)
--본격적 쿼리
SELECT
*,
ROW_NUMBER() OVER (PARTITION BY class ORDER BY score DESC) AS col_rownum,-- PARTITION BY 는 특정 기준별로 나눠준다.
RANK() OVER (PARTITION BY class ORDER BY score DESC) AS col_rank,
DENSE_RANK() OVER (PARTITION BY class ORDER BY score DESC) AS col_denserank,
FROM tb
;
/*
시별 주문수, 매출, 고객수 구하기
조건 1. 2017년
조건 2. 고객수 내림차순 기준 순위 제공
조건 3. 순위는 전체 순위, 주(STATE) 기준 순위 둘다 제공할 것
조건 4. 배송완료(order_status = delivered’) 된 건만 집계
*/
-- customer_state, customer_city, ord_cnt, ord_amt 구하기
WITH tb AS (
SELECT
item.order_id,
SUM(item.price) AS ord_amt
FROM `olist.olist_order_items` AS item
GROUP BY item.order_id
)
SELECT
cust.customer_state,
cust.customer_city,
COUNT(DISTINCT ord.order_id) AS ord_cnt,
COUNT(DISTINCT cust.customer_unique_id) AS cust_cnt,
SUM(tb.ord_amt) AS ord_amt
FROM `olist.olist_orders` AS ord
INNER JOIN tb -- 첫번째 테이블 엮기 `olist.olist_oder_items` + `olist.olist_orders`
ON ord.order_id = tb.order_id
LEFT JOIN `olist.olist_customers` AS cust -- 두번째 테이블 엮기 `olist.olist_oder_items` + `olist.olist_orders` + `olist.olist_customers`
ON ord.customer_id = cust.customer_id
WHERE 1=1 -- 의미가 없는 TRUE, 이것을 언급하면 뒤에 AND 로 연결 가능.
AND EXTRACT(YEAR FROM order_approved_at) = 2017
AND order_status = 'delivered'
GROUP BY 1, 2 -- 첫번째열, 두번째열이 집계의 기준
;
-- 최종
WITH tb AS (
SELECT
item.order_id,
SUM(item.price) AS ord_amt
FROM `olist.olist_order_items` AS item
GROUP BY item.order_id
)
, base AS (
SELECT
cust.customer_state,
cust.customer_city,
COUNT(DISTINCT ord.order_id) AS ord_cnt,
COUNT(DISTINCT cust.customer_unique_id) AS cust_cnt,
SUM(tb.ord_amt) AS ord_amt
FROM `olist.olist_orders` AS ord
INNER JOIN tb -- 첫번째 테이블 엮기 `olist.olist_oder_items` + `olist.olist_orders`
ON ord.order_id = tb.order_id
LEFT JOIN `olist.olist_customers` AS cust -- 두번째 테이블 엮기 `olist.olist_oder_items` + `olist.olist_orders` + `olist.olist_customers`
ON ord.customer_id = cust.customer_id
WHERE 1=1 -- 의미가 없는 TRUE, 이것을 언급하면 뒤에 AND 로 연결 가능.
AND EXTRACT(YEAR FROM order_approved_at) = 2017
AND order_status = 'delivered'
GROUP BY 1, 2 -- 첫번째열, 두번째열이 집계의 기준
)
SELECT -- 순위 붙이기
*,
ROW_NUMBER() OVER (PARTITION BY customer_state ORDER BY cust_cnt DESC) AS cust_rownum,
RANK() OVER (PARTITION BY customer_state ORDER BY cust_cnt DESC) AS cust_rank,
DENSE_RANK() OVER (PARTITION BY customer_state ORDER BY cust_cnt DESC) AS cust_denserank,
FROM base
ORDER BY cust_rownum
;
Studied from 제로베이스 데이터 스쿨