Configuring Base Parameters for our Notebook Tasks

Project Code

resources/citibike_etl_pipeline_nb.job.yml

resources:
  jobs:
    citibike_etl_pipeline_nb:
      name: citibike_etl_pipeline_nb
      tasks:
        - task_key: 01_bronze_citibike
          notebook_task:
            notebook_path: ../citibike_etl/notebooks/01_bronze/01_bronze_citibike.ipynb
            base_parameters:
              pipeline_id: "{{job.id}}"
              run_id: "{{job.run_id}}"
              task_id: "{{task.run_id}}"
              processed_timestamp: "{{job.start_time.iso_datetime}}"
              catalog: "${var.catalog}"
            source: WORKSPACE
          job_cluster_key: citibike_etl_pipeline_nb_cluster
        - task_key: 02_silver_citibike
          depends_on:
            - task_key: 01_bronze_citibike
          notebook_task:
            notebook_path: ../citibike_etl/notebooks/02_silver/02_silver_citibike.ipynb
            base_parameters:
              pipeline_id: "{{job.id}}"
              run_id: "{{job.run_id}}"
              task_id: "{{task.run_id}}"
              processed_timestamp: "{{job.start_time.iso_datetime}}"
              catalog: "${var.catalog}"
            source: WORKSPACE
          job_cluster_key: citibike_etl_pipeline_nb_cluster
        - task_key: 03_gold_citibike_daily_ride_summary
          depends_on:
            - task_key: 02_silver_citibike
          notebook_task:
            notebook_path: ../citibike_etl/notebooks/03_gold/03_gold_citibike_daily_ride_summary.ipynb
            source: WORKSPACE
          job_cluster_key: citibike_etl_pipeline_nb_cluster
        - task_key: 03_gold_citibike_daily_station_performance
          depends_on:
            - task_key: 02_silver_citibike
          notebook_task:
            notebook_path: ../citibike_etl/notebooks/03_gold/03_gold_citibike_daily_station_performance.ipynb
            source: WORKSPACE
          job_cluster_key: citibike_etl_pipeline_nb_cluster
      job_clusters:
        - job_cluster_key: citibike_etl_pipeline_nb_cluster
          new_cluster:
            cluster_name: ""
            spark_version: 15.4.x-scala2.12
            spark_conf:
              spark.master: local[*, 4]
              spark.databricks.cluster.profile: singleNode
            azure_attributes:
              first_on_demand: 1
              availability: SPOT_WITH_FALLBACK_AZURE
              spot_bid_max_price: -1
            node_type_id: Standard_DS3_v2
            driver_node_type_id: Standard_DS3_v2
            custom_tags:
              ResourceClass: SingleNode
            spark_env_vars:
              PYSPARK_PYTHON: /databricks/python3/bin/python3
            enable_elastic_disk: true
            data_security_mode: SINGLE_USER
            runtime_engine: STANDARD
            num_workers: 0
      queue:
        enabled: true
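
The base_parameters on each notebook task use Databricks dynamic value references: {{job.id}}, {{job.run_id}} and {{task.run_id}} resolve to the current job, job run and task run identifiers, and {{job.start_time.iso_datetime}} to the run's start time. ${var.catalog} is a bundle variable substitution, so catalog needs to be declared under variables in databricks.yml for it to resolve. Inside the notebooks these parameters arrive as widgets and are read with dbutils.widgets.get. A minimal sketch, assuming the same widget names, of declaring placeholder defaults so a notebook can also be run interactively outside the job (job runs override these with the base_parameters above):

# Hedged sketch: placeholder defaults for interactive runs; base_parameters
# supplied by the job take precedence over these values.
dbutils.widgets.text("pipeline_id", "interactive")
dbutils.widgets.text("run_id", "interactive")
dbutils.widgets.text("task_id", "interactive")
dbutils.widgets.text("processed_timestamp", "1970-01-01T00:00:00")
dbutils.widgets.text("catalog", "citibike_dev")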

citibike_etl/notebooks/01_bronze/01_bronze_citibike.ipynb

from pyspark.sql.types import StructType, StructField, StringType, DecimalType, TimestampType
from pyspark.sql.functions import create_map, lit
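# Job and task context passed in through the job's base_parameters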
pipeline_id = dbutils.widgets.get("pipeline_id")
run_id = dbutils.widgets.get("run_id")
task_id = dbutils.widgets.get("task_id")
processed_timestamp = dbutils.widgets.get("processed_timestamp")
catalog = dbutils.widgets.get("catalog")
schema = StructType([
    StructField("ride_id", StringType(), True),
    StructField("rideable_type", StringType(), True),
    StructField("started_at", TimestampType(), True),
    StructField("ended_at", TimestampType(), True),
    StructField("start_station_name", StringType(), True), 
    StructField("start_station_id", StringType(), True),   
    StructField("end_station_name", StringType(), True), 
    StructField("end_station_id", StringType(), True), 
    StructField("start_lat", DecimalType(), True), 
    StructField("start_lng", DecimalType(), True), 
    StructField("end_lat", DecimalType(), True), 
    StructField("end_lng", DecimalType(), True), 
    StructField("member_casual", StringType(), True), 
])
df = spark.read.csv("/Volumes/citibike_dev/00_landing/source_citibike_data/JC-202503-citibike-tripdata.csv", schema=schema, header=True)
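# Stamp each row with job lineage metadata as a map column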
df = df.withColumn("metadata", 
              create_map(
                  lit("pipeline_id"), lit(pipeline_id),
                  lit("run_id"), lit(run_id),
                  lit("task_id"), lit(task_id),
                  lit("processed_timestamp"), lit(processed_timestamp)
                  ))
df.write \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable("citibike_dev.01_bronze.jc_citibike")
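
The bronze notebook stamps every row with a metadata map column built from the job parameters using create_map. A short, hedged usage example of pulling those lineage values back out of the map by key (reading the same table the notebook writes):

from pyspark.sql.functions import col

# Read the bronze table and extract individual lineage values from the metadata map.
bronze_df = spark.read.table("citibike_dev.01_bronze.jc_citibike")
bronze_df.select(
    "ride_id",
    col("metadata")["run_id"].alias("run_id"),
    col("metadata")["processed_timestamp"].alias("processed_timestamp"),
).show(5, truncate=False)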

citibike_etl/notebooks/02_silver/02_silver_citibike.ipynb

import os
import sys

# Add the project root to sys.path so the project's utility modules can be imported
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, "..", "..", ".."))

sys.path.append(project_root)
from citibike.citibike_utils import get_trip_duration_mins
from utils.datetime_utils import timestamp_to_date_col
from pyspark.sql.functions import create_map, lit
pipeline_id = dbutils.widgets.get("pipeline_id")
run_id = dbutils.widgets.get("run_id")
task_id = dbutils.widgets.get("task_id")
processed_timestamp = dbutils.widgets.get("processed_timestamp")
catalog = dbutils.widgets.get("catalog")
df = spark.read.table("citibike_dev.01_bronze.jc_citibike")
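# Derive trip duration and trip start date using the shared utility functions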
df = get_trip_duration_mins(spark, df, "started_at", "ended_at", "trip_duration_mins")
df = timestamp_to_date_col(spark, df, "started_at", "trip_start_date")
df = df.withColumn("metadata", 
              create_map(
                  lit("pipeline_id"), lit(pipeline_id),
                  lit("run_id"), lit(run_id),
                  lit("task_id"), lit(task_id),
                  lit("processed_timestamp"), lit(processed_timestamp)
                  ))
df = df.select(
    "ride_id",
    "trip_start_date",
    "started_at",
    "ended_at",
    "start_station_name",
    "end_station_name",
    "trip_duration_mins",
    "metadata"
    )
df.write \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable("citibike_dev.02_silver.jc_citibike")
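
The silver notebook relies on two helpers, get_trip_duration_mins and timestamp_to_date_col, imported from the project's citibike/citibike_utils.py and utils/datetime_utils.py modules, which are not shown on this page. A minimal sketch of what they might look like, inferred only from how they are called above and not the project's actual implementations:

# Illustrative sketch only -- the real implementations live in
# citibike/citibike_utils.py and utils/datetime_utils.py in the repo.
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col, to_date, unix_timestamp

def get_trip_duration_mins(spark: SparkSession, df: DataFrame,
                           start_col: str, end_col: str, output_col: str) -> DataFrame:
    # Trip duration in minutes, from the difference between the two timestamp columns.
    return df.withColumn(
        output_col,
        (unix_timestamp(col(end_col)) - unix_timestamp(col(start_col))) / 60
    )

def timestamp_to_date_col(spark: SparkSession, df: DataFrame,
                          timestamp_col: str, output_col: str) -> DataFrame:
    # Calendar date extracted from a timestamp column.
    return df.withColumn(output_col, to_date(col(timestamp_col)))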