GCS BigQuery ML - ghdrako/doc_snipets GitHub Wiki

ML model in BigQuery

Docs:

Model Types

  • Linear
  • Regresion
  • Logistic Regression
   CREATE OR REPLACE MODEL <dataset>.<name>
   OPTIONS(model_type='<type') AS
   <training dataset>

Process

  1. Training process = SELECT * FROM ML.TRAINING_INFO(MODEL `mydataset.mymodel`)

  2. Inspect Weight = SELECT * FROM ML.WEIGHTS(MODEL `mydataset.mymodel`, (<query>))

  3. Evaluation = SELECT * FROM ML.EVALUATE(MODEL `mydataset.mymodel`)

  4. Prediction = SELECT * FROM ML.PREDICT(MODEL `mydataset.mymodel`, (<query>))

Workin with BigQuery ML

  1. Prepare Dataset
  2. Create/train
   CREATE MODEL `mydataset.mymodel`
   OPTIONS(model_type='<type') AS
   SELECT
  1. Evaluate
    SELECT * FROM ML.EVALUATE(MODEL `mydataset.mymodel`)
  1. Predict/classify
    SELECT * FROM ML.PREDICT(MODEL `mydataset.mymodel`, (<query>))

BQML supports the two listed above.

There are the three model types to choose from:

Model Model Type Label Data type Example

Forecasting linear_reg Numeric value (typically an integer or floating point) Forecast sales figures for next year given historical sales data. Classification logistic_reg 0 or 1 for binary classification Classify an email as spam or not spam given the context.

Forecasting numeric values like next month's sales with Linear Regression (linear_reg). Binary or Multiclass Classification like spam or not spam email by using Logistic Regression (logistic_reg). k-Means Clustering for when you want unsupervised learning for exploration (kmeans).

CREATE OR REPLACE MODEL ecommerce.classification_model OPTIONS ( model_type='logistic_reg', labels = ['will_buy_on_return_visit'] ) AS

#standardSQL SELECT

  • EXCEPT(fullVisitorId) FROM

features

(SELECT fullVisitorId, IFNULL(totals.bounces, 0) AS bounces, IFNULL(totals.timeOnSite, 0) AS time_on_site FROM data-to-insights.ecommerce.web_analytics WHERE totals.newVisits = 1 AND date BETWEEN '20160801' AND '20170430') # train on first 9 months JOIN (SELECT fullvisitorid, IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit FROM data-to-insights.ecommerce.web_analytics GROUP BY fullvisitorid) USING (fullVisitorId) ;

n BQML, roc_auc is simply a queryable field when evaluating your trained ML model.

Now that training is complete, run this query to evaluate how well the model performs using ML.EVALUATE:

SELECT roc_auc, CASE WHEN roc_auc > .9 THEN 'good' WHEN roc_auc > .8 THEN 'fair' WHEN roc_auc > .7 THEN 'decent' WHEN roc_auc > .6 THEN 'not great' ELSE 'poor' END AS model_quality FROM ML.EVALUATE(MODEL ecommerce.classification_model, (

SELECT

  • EXCEPT(fullVisitorId) FROM

features

(SELECT fullVisitorId, IFNULL(totals.bounces, 0) AS bounces, IFNULL(totals.timeOnSite, 0) AS time_on_site FROM data-to-insights.ecommerce.web_analytics WHERE totals.newVisits = 1 AND date BETWEEN '20170501' AND '20170630') # eval on 2 months JOIN (SELECT fullvisitorid, IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit FROM data-to-insights.ecommerce.web_analytics GROUP BY fullvisitorid) USING (fullVisitorId)

));

You should see the following result:

Row roc_auc model_quality

1 0.724588 decent

After evaluating your model you get a roc_auc of 0.72, which shows the model has decent, but not great, predictive power. Since the goal is to get the area under the curve as close to 1.0 as possible, there is room for improvement.

CREATE OR REPLACE MODEL ecommerce.classification_model_2 OPTIONS (model_type='logistic_reg', labels = ['will_buy_on_return_visit']) AS

WITH all_visitor_stats AS ( SELECT fullvisitorid, IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit FROM data-to-insights.ecommerce.web_analytics GROUP BY fullvisitorid )

add in new features

SELECT * EXCEPT(unique_session_id) FROM (

SELECT CONCAT(fullvisitorid, CAST(visitId AS STRING)) AS unique_session_id,

  # labels
  will_buy_on_return_visit,

  MAX(CAST(h.eCommerceAction.action_type AS INT64)) AS latest_ecommerce_progress,

  # behavior on the site
  IFNULL(totals.bounces, 0) AS bounces,
  IFNULL(totals.timeOnSite, 0) AS time_on_site,
  IFNULL(totals.pageviews, 0) AS pageviews,

  # where the visitor came from
  trafficSource.source,
  trafficSource.medium,
  channelGrouping,

  # mobile or desktop
  device.deviceCategory,

  # geographic
  IFNULL(geoNetwork.country, "") AS country

FROM data-to-insights.ecommerce.web_analytics, UNNEST(hits) AS h

JOIN all_visitor_stats USING(fullvisitorid)

WHERE 1=1 # only predict for new visits AND totals.newVisits = 1 AND date BETWEEN '20160801' AND '20170430' # train 9 months

GROUP BY unique_session_id, will_buy_on_return_visit, bounces, time_on_site, totals.pageviews, trafficSource.source, trafficSource.medium, channelGrouping, device.deviceCategory, country );

Evaluate this new model to see if there is better predictive power:

#standardSQL SELECT roc_auc, CASE WHEN roc_auc > .9 THEN 'good' WHEN roc_auc > .8 THEN 'fair' WHEN roc_auc > .7 THEN 'decent' WHEN roc_auc > .6 THEN 'not great' ELSE 'poor' END AS model_quality FROM ML.EVALUATE(MODEL ecommerce.classification_model_2, (

WITH all_visitor_stats AS ( SELECT fullvisitorid, IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit FROM data-to-insights.ecommerce.web_analytics GROUP BY fullvisitorid )

add in new features

SELECT * EXCEPT(unique_session_id) FROM (

SELECT CONCAT(fullvisitorid, CAST(visitId AS STRING)) AS unique_session_id,

  # labels
  will_buy_on_return_visit,

  MAX(CAST(h.eCommerceAction.action_type AS INT64)) AS latest_ecommerce_progress,

  # behavior on the site
  IFNULL(totals.bounces, 0) AS bounces,
  IFNULL(totals.timeOnSite, 0) AS time_on_site,
  totals.pageviews,

  # where the visitor came from
  trafficSource.source,
  trafficSource.medium,
  channelGrouping,

  # mobile or desktop
  device.deviceCategory,

  # geographic
  IFNULL(geoNetwork.country, "") AS country

FROM data-to-insights.ecommerce.web_analytics, UNNEST(hits) AS h

JOIN all_visitor_stats USING(fullvisitorid)

WHERE 1=1 # only predict for new visits AND totals.newVisits = 1 AND date BETWEEN '20170501' AND '20170630' # eval 2 months

GROUP BY unique_session_id, will_buy_on_return_visit, bounces, time_on_site, totals.pageviews, trafficSource.source, trafficSource.medium, channelGrouping, device.deviceCategory, country ) ));

Predict which new visitors will come back and purchase

SELECT * FROM ml.PREDICT(MODEL ecommerce.classification_model_2, (

WITH all_visitor_stats AS ( SELECT fullvisitorid, IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit FROM data-to-insights.ecommerce.web_analytics GROUP BY fullvisitorid )

SELECT CONCAT(fullvisitorid, '-',CAST(visitId AS STRING)) AS unique_session_id,

  # labels
  will_buy_on_return_visit,

  MAX(CAST(h.eCommerceAction.action_type AS INT64)) AS latest_ecommerce_progress,

  # behavior on the site
  IFNULL(totals.bounces, 0) AS bounces,
  IFNULL(totals.timeOnSite, 0) AS time_on_site,
  totals.pageviews,

  # where the visitor came from
  trafficSource.source,
  trafficSource.medium,
  channelGrouping,

  # mobile or desktop
  device.deviceCategory,

  # geographic
  IFNULL(geoNetwork.country, "") AS country

FROM data-to-insights.ecommerce.web_analytics, UNNEST(hits) AS h

JOIN all_visitor_stats USING(fullvisitorid)

WHERE # only predict for new visits totals.newVisits = 1 AND date BETWEEN '20170701' AND '20170801' # test 1 month

GROUP BY unique_session_id, will_buy_on_return_visit, bounces, time_on_site, totals.pageviews, trafficSource.source, trafficSource.medium, channelGrouping, device.deviceCategory, country )

)

ORDER BY predicted_will_buy_on_return_visit DESC;


⚠️ **GitHub.com Fallback** ⚠️