Gold Notebook Walkthrough - pathfinder-analytics-uk/dab_project GitHub Wiki

Project Code

citibike_etl/notebooks/03_gold/03_gold_citibike_daily_ride_summary

# Alias the functions module rather than importing max/min/round by name, so
# Python's builtins of the same names are not shadowed in this notebook.
from pyspark.sql import functions as F

# Source: silver-layer trips table. `spark` is assumed to be the session
# supplied by the notebook runtime; it is not created here.
df = spark.read.table("citibike_dev.02_silver.jc_citibike")

# One output row per trip_start_date: max/min/average trip duration (each
# rounded to 2 decimal places) plus the number of rides.
df = df.groupBy("trip_start_date").agg(
    F.round(F.max("trip_duration_mins"), 2).alias("max_trip_duration_mins"),
    F.round(F.min("trip_duration_mins"), 2).alias("min_trip_duration_mins"),
    F.round(F.avg("trip_duration_mins"), 2).alias("avg_trip_duration_mins"),
    # count(column) only counts rows whose ride_id is non-null.
    F.count("ride_id").alias("total_trips"),
)

# Replace the gold table in full on every run. "overwriteSchema" asks the
# writer to replace the table schema as well (a Delta Lake option; assumes
# the target is a Delta table -- confirm against the workspace default).
(
    df.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("citibike_dev.03_gold.daily_ride_summary")
)

citibike_etl/notebooks/03_gold/03_gold_citibike_daily_station_performance

# Alias the functions module rather than importing round by name, so Python's
# builtin round() is not shadowed in this notebook.
from pyspark.sql import functions as F

# Source: silver-layer trips table. `spark` is assumed to be the session
# supplied by the notebook runtime; it is not created here.
df = spark.read.table("citibike_dev.02_silver.jc_citibike")

# One output row per (trip_start_date, start_station_name): average trip
# duration rounded to 2 decimal places plus the number of rides.
df = df.groupBy("trip_start_date", "start_station_name").agg(
    F.round(F.avg("trip_duration_mins"), 2).alias("avg_trip_duration_mins"),
    # count(column) only counts rows whose ride_id is non-null.
    F.count("ride_id").alias("total_trips"),
)

# Replace the gold table in full on every run. "overwriteSchema" asks the
# writer to replace the table schema as well (a Delta Lake option; assumes
# the target is a Delta table -- confirm against the workspace default).
(
    df.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("citibike_dev.03_gold.daily_station_performance")
)