Configuring, Deploying and Running the ETL Job - pathfinder-analytics-uk/dab_project GitHub Wiki
Project Code
resources/citibike_etl_pipeline_nb.job.yml
# Bundle resource: notebook-based Citi Bike ETL job.
# Task graph: 01_bronze -> 02_silver -> two 03_gold tasks (both gold tasks
# depend only on silver, not on each other).
resources:
  jobs:
    citibike_etl_pipeline_nb:
      name: citibike_etl_pipeline_nb

      tasks:
        # Bronze: entry point of the graph (no depends_on).
        - task_key: 01_bronze_citibike
          notebook_task:
            # Notebook paths are relative ("../") — presumably resolved
            # against this file's location at deploy time; confirm if moved.
            notebook_path: ../citibike_etl/notebooks/01_bronze/01_bronze_citibike.ipynb
            source: WORKSPACE
          job_cluster_key: citibike_etl_pipeline_nb_cluster

        # Silver: runs after bronze.
        - task_key: 02_silver_citibike
          depends_on:
            - task_key: 01_bronze_citibike
          notebook_task:
            notebook_path: ../citibike_etl/notebooks/02_silver/02_silver_citibike.ipynb
            source: WORKSPACE
          job_cluster_key: citibike_etl_pipeline_nb_cluster

        # Gold (daily ride summary): runs after silver.
        - task_key: 03_gold_citibike_daily_ride_summary
          depends_on:
            - task_key: 02_silver_citibike
          notebook_task:
            notebook_path: ../citibike_etl/notebooks/03_gold/03_gold_citibike_daily_ride_summary.ipynb
            source: WORKSPACE
          job_cluster_key: citibike_etl_pipeline_nb_cluster

        # Gold (daily station performance): runs after silver.
        - task_key: 03_gold_citibike_daily_station_performance
          depends_on:
            - task_key: 02_silver_citibike
          notebook_task:
            notebook_path: ../citibike_etl/notebooks/03_gold/03_gold_citibike_daily_station_performance.ipynb
            source: WORKSPACE
          job_cluster_key: citibike_etl_pipeline_nb_cluster

      # One job cluster, referenced by every task via job_cluster_key.
      job_clusters:
        - job_cluster_key: citibike_etl_pipeline_nb_cluster
          new_cluster:
            cluster_name: ""
            spark_version: 15.4.x-scala2.12
            # Single-node setup: local Spark master, singleNode profile,
            # the SingleNode resource tag and num_workers: 0 go together.
            spark_conf:
              # Quoted because the value contains '[', '*' and ', '.
              spark.master: 'local[*, 4]'
              spark.databricks.cluster.profile: singleNode
            azure_attributes:
              first_on_demand: 1
              availability: SPOT_WITH_FALLBACK_AZURE
              # NOTE(review): -1 is the Databricks-documented "no price
              # cap" setting for Azure spot — confirm this is intended.
              spot_bid_max_price: -1
            node_type_id: Standard_DS3_v2
            driver_node_type_id: Standard_DS3_v2
            custom_tags:
              ResourceClass: SingleNode
            spark_env_vars:
              PYSPARK_PYTHON: /databricks/python3/bin/python3
            enable_elastic_disk: true
            data_security_mode: SINGLE_USER
            runtime_engine: STANDARD
            num_workers: 0

      # Job-level run queueing is switched on.
      queue:
        enabled: true