GCS BigQuery ML - ghdrako/doc_snipets GitHub Wiki
ML model in BigQuery
-
https://cloud.google.com/bigquery-ml/docs/customer-managed-encryption-key
-
Label = alias a column as 'label' or specify column in OPTIONS using imput_label_cols
-
Feature = passes through to the model as part of your SQL SELECT statement SELECT * FROM ML.FEATURE_INFO(MODEL
mydataset.mymodel
) -
Model = an object created in BigQuery that resides in your BigQuery dataset
- Linear
- Regresion
- Logistic Regression
CREATE OR REPLACE MODEL <dataset>.<name>
OPTIONS(model_type='<type') AS
<training dataset>
-
Training process =
SELECT * FROM ML.TRAINING_INFO(MODEL `mydataset.mymodel`)
-
Inspect Weight =
SELECT * FROM ML.WEIGHTS(MODEL `mydataset.mymodel`, (<query>))
-
Evaluation =
SELECT * FROM ML.EVALUATE(MODEL `mydataset.mymodel`)
-
Prediction =
SELECT * FROM ML.PREDICT(MODEL `mydataset.mymodel`, (<query>))
- Prepare Dataset
- Create/train
CREATE MODEL `mydataset.mymodel`
OPTIONS(model_type='<type') AS
SELECT
- Evaluate
SELECT * FROM ML.EVALUATE(MODEL `mydataset.mymodel`)
- Predict/classify
SELECT * FROM ML.PREDICT(MODEL `mydataset.mymodel`, (<query>))
BQML supports the two listed above.
There are the three model types to choose from:
Model Model Type Label Data type Example
Forecasting linear_reg Numeric value (typically an integer or floating point) Forecast sales figures for next year given historical sales data. Classification logistic_reg 0 or 1 for binary classification Classify an email as spam or not spam given the context.
Forecasting numeric values like next month's sales with Linear Regression (linear_reg). Binary or Multiclass Classification like spam or not spam email by using Logistic Regression (logistic_reg). k-Means Clustering for when you want unsupervised learning for exploration (kmeans).
CREATE OR REPLACE MODEL ecommerce.classification_model
OPTIONS
(
model_type='logistic_reg',
labels = ['will_buy_on_return_visit']
)
AS
#standardSQL SELECT
- EXCEPT(fullVisitorId) FROM
(SELECT
fullVisitorId,
IFNULL(totals.bounces, 0) AS bounces,
IFNULL(totals.timeOnSite, 0) AS time_on_site
FROM
data-to-insights.ecommerce.web_analytics
WHERE
totals.newVisits = 1
AND date BETWEEN '20160801' AND '20170430') # train on first 9 months
JOIN
(SELECT
fullvisitorid,
IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit
FROM
data-to-insights.ecommerce.web_analytics
GROUP BY fullvisitorid)
USING (fullVisitorId)
;
n BQML, roc_auc is simply a queryable field when evaluating your trained ML model.
Now that training is complete, run this query to evaluate how well the model performs using ML.EVALUATE:
SELECT roc_auc, CASE WHEN roc_auc > .9 THEN 'good' WHEN roc_auc > .8 THEN 'fair' WHEN roc_auc > .7 THEN 'decent' WHEN roc_auc > .6 THEN 'not great' ELSE 'poor' END AS model_quality FROM ML.EVALUATE(MODEL ecommerce.classification_model, (
SELECT
- EXCEPT(fullVisitorId) FROM
(SELECT
fullVisitorId,
IFNULL(totals.bounces, 0) AS bounces,
IFNULL(totals.timeOnSite, 0) AS time_on_site
FROM
data-to-insights.ecommerce.web_analytics
WHERE
totals.newVisits = 1
AND date BETWEEN '20170501' AND '20170630') # eval on 2 months
JOIN
(SELECT
fullvisitorid,
IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit
FROM
data-to-insights.ecommerce.web_analytics
GROUP BY fullvisitorid)
USING (fullVisitorId)
));
You should see the following result:
Row roc_auc model_quality
1 0.724588 decent
After evaluating your model you get a roc_auc of 0.72, which shows the model has decent, but not great, predictive power. Since the goal is to get the area under the curve as close to 1.0 as possible, there is room for improvement.
CREATE OR REPLACE MODEL ecommerce.classification_model_2
OPTIONS
(model_type='logistic_reg', labels = ['will_buy_on_return_visit']) AS
WITH all_visitor_stats AS (
SELECT
fullvisitorid,
IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit
FROM data-to-insights.ecommerce.web_analytics
GROUP BY fullvisitorid
)
SELECT * EXCEPT(unique_session_id) FROM (
SELECT CONCAT(fullvisitorid, CAST(visitId AS STRING)) AS unique_session_id,
# labels
will_buy_on_return_visit,
MAX(CAST(h.eCommerceAction.action_type AS INT64)) AS latest_ecommerce_progress,
# behavior on the site
IFNULL(totals.bounces, 0) AS bounces,
IFNULL(totals.timeOnSite, 0) AS time_on_site,
IFNULL(totals.pageviews, 0) AS pageviews,
# where the visitor came from
trafficSource.source,
trafficSource.medium,
channelGrouping,
# mobile or desktop
device.deviceCategory,
# geographic
IFNULL(geoNetwork.country, "") AS country
FROM data-to-insights.ecommerce.web_analytics
,
UNNEST(hits) AS h
JOIN all_visitor_stats USING(fullvisitorid)
WHERE 1=1 # only predict for new visits AND totals.newVisits = 1 AND date BETWEEN '20160801' AND '20170430' # train 9 months
GROUP BY unique_session_id, will_buy_on_return_visit, bounces, time_on_site, totals.pageviews, trafficSource.source, trafficSource.medium, channelGrouping, device.deviceCategory, country );
Evaluate this new model to see if there is better predictive power:
#standardSQL SELECT roc_auc, CASE WHEN roc_auc > .9 THEN 'good' WHEN roc_auc > .8 THEN 'fair' WHEN roc_auc > .7 THEN 'decent' WHEN roc_auc > .6 THEN 'not great' ELSE 'poor' END AS model_quality FROM ML.EVALUATE(MODEL ecommerce.classification_model_2, (
WITH all_visitor_stats AS (
SELECT
fullvisitorid,
IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit
FROM data-to-insights.ecommerce.web_analytics
GROUP BY fullvisitorid
)
SELECT * EXCEPT(unique_session_id) FROM (
SELECT CONCAT(fullvisitorid, CAST(visitId AS STRING)) AS unique_session_id,
# labels
will_buy_on_return_visit,
MAX(CAST(h.eCommerceAction.action_type AS INT64)) AS latest_ecommerce_progress,
# behavior on the site
IFNULL(totals.bounces, 0) AS bounces,
IFNULL(totals.timeOnSite, 0) AS time_on_site,
totals.pageviews,
# where the visitor came from
trafficSource.source,
trafficSource.medium,
channelGrouping,
# mobile or desktop
device.deviceCategory,
# geographic
IFNULL(geoNetwork.country, "") AS country
FROM data-to-insights.ecommerce.web_analytics
,
UNNEST(hits) AS h
JOIN all_visitor_stats USING(fullvisitorid)
WHERE 1=1 # only predict for new visits AND totals.newVisits = 1 AND date BETWEEN '20170501' AND '20170630' # eval 2 months
GROUP BY unique_session_id, will_buy_on_return_visit, bounces, time_on_site, totals.pageviews, trafficSource.source, trafficSource.medium, channelGrouping, device.deviceCategory, country ) ));
Predict which new visitors will come back and purchase
SELECT
*
FROM
ml.PREDICT(MODEL ecommerce.classification_model_2
,
(
WITH all_visitor_stats AS (
SELECT
fullvisitorid,
IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit
FROM data-to-insights.ecommerce.web_analytics
GROUP BY fullvisitorid
)
SELECT CONCAT(fullvisitorid, '-',CAST(visitId AS STRING)) AS unique_session_id,
# labels
will_buy_on_return_visit,
MAX(CAST(h.eCommerceAction.action_type AS INT64)) AS latest_ecommerce_progress,
# behavior on the site
IFNULL(totals.bounces, 0) AS bounces,
IFNULL(totals.timeOnSite, 0) AS time_on_site,
totals.pageviews,
# where the visitor came from
trafficSource.source,
trafficSource.medium,
channelGrouping,
# mobile or desktop
device.deviceCategory,
# geographic
IFNULL(geoNetwork.country, "") AS country
FROM data-to-insights.ecommerce.web_analytics
,
UNNEST(hits) AS h
JOIN all_visitor_stats USING(fullvisitorid)
WHERE # only predict for new visits totals.newVisits = 1 AND date BETWEEN '20170701' AND '20170801' # test 1 month
GROUP BY unique_session_id, will_buy_on_return_visit, bounces, time_on_site, totals.pageviews, trafficSource.source, trafficSource.medium, channelGrouping, device.deviceCategory, country )
)
ORDER BY predicted_will_buy_on_return_visit DESC;