Day16_SQL분석 23‐25_SQL - bonniekwon0721/Dataanalytics-study GitHub Wiki


빅쿼리에서 날짜 다루기 1

DATE 2017-11-25
DATETIME 2017-11-25T11:10:33
TIMESTAMP UTC 11:10:33 2017-11-25
TIME 11:10:33

Select

DATE(2016, 12, 25) AS date_ymd

DATE(DATETIME ‘2016-12-25 23:59:59’) AS date_dt,

DATE(TIMESTAMP ‘2016-12-25 05:30:00+07’, ‘America/Los_Angeles’ AS date_tstz;


Select

DATETIME(2008, 12, 25, 05, 30, 00) AS datetime_ymd


-- DATE() / DATETIME() / TIMESTAMP()

SELECT
  order_purchase_timestamp,

  DATE(order_purchase_timestamp) as ord_date,

  DATETIME(order_purchase_timestamp) as ord_dt,

  TIMESTAMP(order_purchase_timestamp) as ord_ts,
  TIMESTAMP(DATE(order_purchase_timestamp)) as ord_date_ts,

  TIME(order_purchase_timestamp) as ord_time
FROM `olist.olist_orders`

빅쿼리에서 날짜 다루기 2

  • 날짜에서 일부분만 추출하고 싶다면?
    • EXTRACT(’추출할 부분’ FROM ‘날짜 컬럼’)
추출할 부분 의미 결과(정수)
YEAR 연도 2017
MONTH 11
DAY 25
DAYOFWEEK 요일(0:일요일 ~ 6: 토요일) 1
WEEK 주(기본 일요일 시작) 13
WEEK(MONDAY) 주(월요일 시작) 12
QUARTER 분기 2
-- 필요한 값만 추출하는 EXTRACT

SELECT
  order_purchase_timestamp,
  EXTRACT(YEAR FROM order_purchase_timestamp) AS YEAR,
  EXTRACT(QUARTER FROM order_purchase_timestamp) AS QUARTER,
  EXTRACT(MONTH FROM order_purchase_timestamp) AS MONTH,
  EXTRACT(DAY FROM order_purchase_timestamp) AS DAY,
  EXTRACT(DAYOFWEEK FROM order_purchase_timestamp) AS DAYOFWEEK,
  EXTRACT(WEEK FROM order_purchase_timestamp) AS WEEK,
  EXTRACT(WEEK(MONDAY) FROM order_purchase_timestamp) AS WEEK_MONDAY,
  EXTRACT(HOUR FROM order_purchase_timestamp) AS HOUR,
  EXTRACT(MINUTE FROM order_purchase_timestamp) AS MINUTE,
  EXTRACT(SECOND FROM order_purchase_timestamp) AS SECOND,
FROM `olist.olist_orders`
WHERE
  DATE(order_purchase_timestamp) = '2017-11-26' -- 일요일
;

빅쿼리에서 날짜 다루기 3

  • 날짜에서 원하는 정보까지만 남기고 싶다면?
    • DATE_TRUNC('날짜', '남기는 부분')
    • DATETIME_TRUNC('날짜', '남기는 부분')
    • TIMESTAMP_TRUNC('날짜', '남기는 부분')
남기는 부분 의미 결과(날짜) 예시(2017-11-26 16:13:13 UTC)
YEAR 연도 해당 연도의 첫 날 2017-01-01 00:00:00 UTC
MONTH 해당 연도, 월의 첫 날 2017-11-0100:00:00 UTC
DAY 해당 날짜의 0시 0분 2017-11-26 00:00:00 UTC
WEEK 주(기본 일요일 시작) 해당 주차의 첫 날(일) 2017-11-26 00:00:00 UTC
WEEK(MONDAY) 주(월요일 시작) 해당 주차의 첫 날(월) 2017-11-20 00:00:00 UTC
QUARTER 해당 분기의 첫 날 2017-10-01 00:00:00 UTC

& HOUR, MINUTES, SECOND

-- 원하는 정보까지만 남기는 DATE/DATETIME/TIMESTAMP_TRUNC

SELECT
  order_purchase_timestamp,
  TIMESTAMP_TRUNC(order_purchase_timestamp, YEAR) AS YEAR,
  TIMESTAMP_TRUNC(order_purchase_timestamp, QUARTER) AS QUARTER,
  TIMESTAMP_TRUNC(order_purchase_timestamp, MONTH) AS MONTH,
  TIMESTAMP_TRUNC(order_purchase_timestamp, DAY) AS DAY,
  TIMESTAMP_TRUNC(order_purchase_timestamp, WEEK) AS WEEK,
  TIMESTAMP_TRUNC(order_purchase_timestamp, WEEK(MONDAY)) AS WEEK_MONDAY,
  TIMESTAMP_TRUNC(order_purchase_timestamp, HOUR) AS HOUR,
  TIMESTAMP_TRUNC(order_purchase_timestamp, MINUTE) AS MINUTE,
  TIMESTAMP_TRUNC(order_purchase_timestamp, SECOND) AS SECOND,
FROM `olist.olist_orders`
WHERE
  DATE(order_purchase_timestamp) = '2017-11-26' -- 일요일
;
SELECT
  order_purchase_timestamp,
  EXTRACT(YEAR FROM order_purchase_timestamp) AS YEAR,
  TIMESTAMP_TRUNC(order_purchase_timestamp, YEAR) AS YEAR2,

  EXTRACT(YEAR FROM order_purchase_timestamp) AS YEAR,
  TIMESTAMP_TRUNC(order_purchase_timestamp, MONTH) AS MONTH2,

FROM `olist.olist_orders`
WHERE
  DATE(order_purchase_timestamp) = '2017-11-26' -- 일요일
  • 상세 설명

    Untitled

날짜별로 지표 뽑기

/* 첫 번째 : 주문 1건당 주문금액, 판매상품의 수를 구합니다 */

with tb as (
  select
    item.order_id,
    sum(item.price) as ord_amt,
    count(item.order_item_id) as prd_cnt
  from`olist.olist_order_items` as item
  group by item.order_id
)

/* 두 번째 : 주문정보 테이블에 1번 결과와, 고객 unique ID를 붙입니다 */
, base as (
  select
    ord.order_approved_at,
    DATE(ord.order_approved_at) as ord_date, 
    DATETIME(ord.order_approved_at) as ord_dt,
    TIMESTAMP(ord.order_approved_at) as ord_ts,

    --EXTRACT / TRUNC
    --EXTRACT
    EXTRACT(YEAR FROM ord.order_approved_at) as ord_year,
    EXTRACT(MONTH FROM ord.order_approved_at) as ord_month,
    --TRUNC
    TIMESTAMP_TRUNC(ord.order_approved_at, YEAR) as ord_year_t,
    TIMESTAMP_TRUNC(ord.order_approved_at, MONTH) as ord_month_t,

    ord.order_id,
    ord.customer_id,
    cust.customer_unique_id,
    tb.ord_amt,
    tb.prd_cnt
  from `olist.olist_orders` as ord
  left join `olist.olist_customers` as cust
    on ord.customer_id = cust.customer_id
  inner join tb
    on ord.order_id = tb.order_id
)

/* 세 번째 : 2번 결과를 집계합니다 */
select
  ord_month_t,
  sum(ord_amt) as `총매출`,
  count(distinct order_id) as `총주문수`,
  sum(prd_cnt) as `총판매상품수`,
  sum(ord_amt) / count(distinct order_id) as `주문당평균가격`,
  sum(prd_cnt) / count(distinct order_id) as `평균판매상품수`,
  sum(ord_amt) / sum(prd_cnt) as `제품개당평균가격`,
  count(distinct customer_unique_id) as `주문고객수` ,
  count(distinct order_id) / count(distinct customer_unique_id) as `주문빈도`
from base
GROUP BY ord_month_t
ORDER BY ord_month_t

Appendix. 빅쿼리의 유용한 함수 1

필요 없는 컬럼은 빼고 불러오자

  • SELECT* EXCEPT(제외할 컬럼)

SELECT * EXCEPT (shipping_limit_date, freight_value) FROM olist.olist_order_items``

;

  • SELECT * REPLACE ('대체할 값' AS 컬럼명)

SELECT * REPLACE (price * 10000 AS price) FROM olist.olist_order_items``

SELECT *
FROM `olist.olist_order_items`
LIMIT 100
;

-- EXCEPT
SELECT * EXCEPT (shipping_limit_date, freight_value)
FROM `olist.olist_order_items`
LIMIT 100
;

-- ADD
SELECT *,
  price * 10000 AS price
FROM `olist.olist_order_items`
LIMIT 100
;

-- REPLACE
-- price 에 * 10,000 해주고 싶어요!
SELECT * REPLACE (price * 10000 AS price)
FROM `olist.olist_order_items`
LIMIT 100
;

Appendix. 빅쿼리의 유용한 함수 2

데이터 타입을 바꾸는 CAST 그리고 SAFE_CAST

  • CAST(’값' As '데이터타입’)
  • SAFE_CAST(’값' As ‘데이터타입’)

A CAST ('123' AS INT64) -- INT64 SAFE_CAST ('123' AS INT64) -- INT64 CAST ('abc' AS INT64) -- error! SAFE_CAST ('abc' AS INT64) -- null

오류를 발생시키지 않는 SAFE 연산

SAFE_ADD(X, Y) = X + Y 에러 있을 경우 에러를 발생시키지 않고 nu로 반환함
SAFE_SUBTRACT(X, Y) = X - Y 에러 있을 경우 에러를 발생시키지 않고 nu로 반환함
SAFE_MULTIPLY(X, Y) = X * Y 에러 있을 경우 에러를 발생시키지 않고 nu로 반환함
SAFE_DIVIDE(X, Y) = X / Y 에러 있을 경우 에러를 발생시키지 않고 nu로 반환함
-- CAST & SAFE_CAST
SELECT
  CAST ('12345' AS INT64) AS cast_result,
  SAFE_CAST ('12345' AS INT64) AS safe_result
;

SELECT
  CAST ('asdf' AS INT64) AS cast_result
;

SELECT
  SAFE_CAST ('asdf' AS INT64) AS safe_result
;

-- DIVIDE & SAFE_DIVIDE
SELECT
  10 / 5
;

SELECT
  10 / 0
;

SELECT
  SAFE_DIVIDE(5, 0),
  IFNULL(SAFE_DIVIDE(5, 0), 0)
;

대시보드용 데이터 준비

자주 쓰이는 IN / NOT IN

  • 컬럼에는 A, B, C, D, E 중 하나의 값이 있다고 가정
WHERE '컬럼' IN (A, B, C) -> A, B, C에 해당
WHERE ZE' NOT IN (A , B, C) -> D, E에 해당
특정 카테고리 매출만 뽑아주세요! WHERE 카테고리 IN (A B, G....)
특정 지역에 사는 고객만 뽑아주세요! WHERE 거주지 IN (A B, C...)
특정 판매자의 제품만 뽑아주세요! WHERE 판매자 IN (A B, C....)

SELECT

order_status,

COUNT(DISTINCT order_ud) AS ord_cnt

FROM olist.olist_orders

WHERE order_status IN (’shipped’, ‘delivered’

)

— IN 조건 : 두 개 상태로 한정

group by 1

;

/*
1. IN을 활용해 특정 상태에서만 집계 ('shipped', 'delivered')
2. 일자별 지표
3. +) Round를 활용해서 소수 둘째 자리까지만 표시하고, SAFE 연산 사용
4. 컬럼명 영문으로 대체
*/

/* 첫 번째 : 주문 1건당 주문금액, 판매상품의 수를 구합니다 */

with tb as (
  select
    item.order_id,
    sum(item.price) as ord_amt,
    count(item.order_item_id) as prd_cnt
  from`olist.olist_order_items` as item
  group by item.order_id
)

/* 두 번째 : 주문정보 테이블에 1번 결과와, 고객 unique ID를 붙입니다 */
, base as (
  select

    DATE(ord.order_approved_at) as ord_date, 

    ord.order_id,
    ord.customer_id,
    cust.customer_unique_id,
    tb.ord_amt,
    tb.prd_cnt
  from `olist.olist_orders` as ord
  left join `olist.olist_customers` as cust
    on ord.customer_id = cust.customer_id
  inner join tb
    on ord.order_id = tb.order_id
    WHERE 1=1 -- 이것은 아무 의미도 없고. TRUE를 의미 함.
      AND order_status IN ('delivered', 'shipped')
      AND order_approved_at IS NOT NULL -- NULL 값은 제외하고 가져오겠다.
)

/* 세 번째 : 2번 결과를 집계합니다 */
select
  ord_date,
  ROUND(sum(ord_amt), 2) as ord_amt,
  count(distinct order_id) as ord_cnt,
  sum(prd_cnt) as prd_cnt,
  ROUND(IFNULL(SAFE_DIVIDE(sum(ord_amt), count(distinct order_id)),0), 2) as avg_ord_amt, -- 주문 당 평균 가격
  ROUND(IFNULL(SAFE_DIVIDE(sum(prd_cnt), count(distinct order_id)),0), 2) as avg_prd_cnt, -- 주문 당 평균 판매상품 수
  ROUND(IFNULL(SAFE_DIVIDE(sum(ord_amt), sum(prd_cnt)),0), 2) as avg_price, -- 판매 제품 당 평균 가격
  count(distinct customer_unique_id) as cust_cnt ,
  ROUND(IFNULL(SAFE_DIVIDE(count(distinct order_id), count(distinct customer_unique_id)),0), 2) as cust_freq
from base
GROUP BY ord_date
ORDER BY ord_date

Adhoc 1 - 가장 고객이 많은 지역은?

요청사항 #1

  • 주(state)별 침투율을 확인하고, : 특정 지역의 인구에 비해서 우리의 고객수를 확인하겠다는 뜻
  • 침투율이 특히 낮은 지역을 중심으로 침투율을 개선해보려고 합니다.
  • 이를 위해 주별 주문수와 고객수를 뽑아주세요.
/* 주(state)별 주문수, 고객수를 추출합니다 */
-- 고객 정보 기준
-- 주문 상태는 필터링하지 않음

SELECT
  customer_state,
  COUNT(DISTINCT ord.order_id) AS ord_cnt,
  COUNT(DISTINCT cust.customer_unique_id) AS cust_cnt
FROM `olist.olist_orders` AS ord
LEFT JOIN `olist.olist_customers` AS cust
  ON ord.customer_id = cust.customer_id
GROUP BY customer_state
ORDER BY cust_cnt DESC

Adhoc 2 - 서브 쿼리를 이용한 join

요청사항 #2

  • 주(state)별 주문수 뽑아야 하는데
  • 고객수 Top 2인 상파울루(SP), 리우데자네이루(R) 주만 뽑아주세요 : 앞에서 한 내용에 조건절만 걸면 될 것 같다.
  • (서브쿼리를 이용한 join을 연습해봅시다!)

LEFT JOIN (SELECT customer_id,

customer_unique_id,

customer_state FROM 'olist.olist_customers")

LEFT JOIN (SELECT customer_id, customer_unique_id, customer_state FROM olist.olist_customers' WHERE customer_state in ('SP','RJ'))

/*
주(state)별 주문수 뽑아야 하는데
고객수 Top 2인 상파울루(SP), 리우데자네이루(RJ) 주만 뽑아주세요
조건. 서브쿼리를 이용한 join!
*/

WITH tb AS (
SELECT
  customer_id,
  customer_state,
  customer_unique_id
FROM `olist.olist_customers`
WHERE customer_state IN('SP', 'RJ')
)
SELECT
  DISTINCT customer_state
FROM tb
;

--테이블 조인 전.
SELECT
  customer_id,
  customer_state,
  customer_unique_id
FROM `olist.olist_customers`
WHERE customer_state IN('SP', 'RJ')
;

-- 최종 쿼리.

SELECT
  cust.customer_state,
  ord.order_id
FROM `olist.olist_orders` AS ord
LEFT JOIN (SELECT
            customer_id,
            customer_state,
            customer_unique_id
           FROM `olist.olist_customers`
           WHERE customer_state IN('SP', 'RJ')) As cust
ON ord.customer_id = cust.customer_id
WHERE cust.customer_state IS NOT NULL -- state 가 NULL이 아닌 값만 보겠다.
;

Adhoc 3 - 순위 매기기

요청사항 #3 (현재 2018년이라고 가정)

  • 지역별 광고 캠페인을 진행해보려고 합니다. 고객수를 보고 고객이 많은 지역부터 전략을 짜려고 하는데, '이를 위해 2017년의 도시(ctgy)별 주문수와 고객술, 매출을 뽑아주세요.
    • 조건. 고객수 기준 순위도 붙여주세요 그리고 전체 순위, 주(state) 내에서의 순위 둘 다 보고 싶어요 배송완료(delivered)된 건만 집계해주세요

순위/행 번호를 매기는 여러가지 방법

점수 ROW_NUMBER() RANK() DENSE_RANK()
100점 1 1 1
95점 2 2 2
95점 3 2 2
80점 4 4 3
중복 없는 순위(고유) 동점일 때 중복 순위(공동 등수)
  • 공동 순위자가 있을 때 그 수만큼 건너뛰고 다음 등수를 매김 | 동점일 때 중복 순위(공동 등수)
  • 순위를 연속으로 매김 |

| ROW _NUMBER() RANK()

DENSE RANK() OVER (ORDER BY [순위를 매기는 조건) 컬럼 & 오름차순(ASC),내림차순(DESC)
ROW_NUMBER()
RANK()
DENSE_RANK() OVER (PARTITION BY [구분 기준]
ORDER BY [순위를 매기는 조건) 예시) 전교 석차 / 학급별 석차
--데이터 생성하기
WITH tb AS (
SELECT 'A' AS name,'class1' AS class, 100 AS score
UNION ALL SELECT 'B' AS name, 'class1' AS class, 90 AS score 
UNION ALL SELECT 'C' AS name, 'class1' AS class, 95 AS score
UNION ALL SELECT 'D' AS name, 'class1' AS class, 95 AS score
UNION ALL SELECT 'E' AS name, 'class1' AS class, 80 AS score
UNION ALL SELECT 'F' AS name, 'class2' AS class, 100 AS score
UNION ALL SELECT 'G' AS name, 'class2' AS class, 90 AS score 
UNION ALL SELECT 'H' AS name, 'class2' AS class, 80 AS score
UNION ALL SELECT 'I' AS name, 'class2' AS class, 70 AS score
UNION ALL SELECT 'J' AS name, 'class2' AS class, 80 AS score
)
--본격적 쿼리
SELECT
  *,
  ROW_NUMBER() OVER (PARTITION BY class ORDER BY score DESC) AS col_rownum,-- PARTITION BY 는 특정 기준별로 나눠준다.
  RANK() OVER (PARTITION BY class ORDER BY score DESC) AS col_rank,
  DENSE_RANK() OVER (PARTITION BY class ORDER BY score DESC) AS col_denserank,
FROM tb
;
/*
시별 주문수, 매출, 고객수 구하기 

조건 1. 2017년
조건 2. 고객수 내림차순 기준 순위 제공
조건 3. 순위는 전체 순위, 주(STATE) 기준 순위 둘다 제공할 것 
조건 4. 배송완료(order_status = delivered’) 된 건만 집계
*/

-- customer_state, customer_city, ord_cnt, ord_amt 구하기
WITH tb AS (
  SELECT
    item.order_id,
    SUM(item.price) AS ord_amt
  FROM `olist.olist_order_items` AS item
  GROUP BY item.order_id
)

  SELECT
    cust.customer_state,
    cust.customer_city,
    COUNT(DISTINCT ord.order_id) AS ord_cnt,
    COUNT(DISTINCT cust.customer_unique_id) AS cust_cnt,
    SUM(tb.ord_amt) AS ord_amt
  FROM `olist.olist_orders` AS ord
  INNER JOIN tb -- 첫번째 테이블 엮기 `olist.olist_oder_items` + `olist.olist_orders` 
    ON ord.order_id = tb.order_id
  LEFT JOIN `olist.olist_customers` AS cust -- 두번째 테이블 엮기 `olist.olist_oder_items` + `olist.olist_orders` + `olist.olist_customers` 
    ON ord.customer_id = cust.customer_id 
  WHERE 1=1 -- 의미가 없는 TRUE, 이것을 언급하면 뒤에 AND 로 연결 가능.
    AND EXTRACT(YEAR FROM order_approved_at) = 2017
    AND order_status = 'delivered'
  GROUP BY 1, 2 -- 첫번째열, 두번째열이 집계의 기준
;

-- 최종
WITH tb AS (
  SELECT
    item.order_id,
    SUM(item.price) AS ord_amt
  FROM `olist.olist_order_items` AS item
  GROUP BY item.order_id
)
, base AS (
  SELECT
    cust.customer_state,
    cust.customer_city,
    COUNT(DISTINCT ord.order_id) AS ord_cnt,
    COUNT(DISTINCT cust.customer_unique_id) AS cust_cnt,
    SUM(tb.ord_amt) AS ord_amt
  FROM `olist.olist_orders` AS ord
  INNER JOIN tb -- 첫번째 테이블 엮기 `olist.olist_oder_items` + `olist.olist_orders` 
    ON ord.order_id = tb.order_id
  LEFT JOIN `olist.olist_customers` AS cust -- 두번째 테이블 엮기 `olist.olist_oder_items` + `olist.olist_orders` + `olist.olist_customers` 
    ON ord.customer_id = cust.customer_id 
  WHERE 1=1 -- 의미가 없는 TRUE, 이것을 언급하면 뒤에 AND 로 연결 가능.
    AND EXTRACT(YEAR FROM order_approved_at) = 2017
    AND order_status = 'delivered'
  GROUP BY 1, 2 -- 첫번째열, 두번째열이 집계의 기준
)
SELECT -- 순위 붙이기
  *,
  ROW_NUMBER() OVER (PARTITION BY customer_state ORDER BY cust_cnt DESC) AS cust_rownum,
  RANK() OVER (PARTITION BY customer_state ORDER BY cust_cnt DESC) AS cust_rank,
  DENSE_RANK() OVER (PARTITION BY customer_state ORDER BY cust_cnt DESC) AS cust_denserank,
FROM base
ORDER BY cust_rownum
;


Studied from 제로베이스 데이터 스쿨