Gold Notebook Walkthrough - pathfinder-analytics-uk/dab_project GitHub Wiki
Project Code
citibike_etl/notebooks/03_gold/03_gold_citibike_daily_ride_summary
from pyspark.sql.functions import max, min, avg, count, round
# Gold layer: per-day ride summary built from the silver Citi Bike table.
silver_rides = spark.read.table("citibike_dev.02_silver.jc_citibike")

# NOTE: max / min / round below are the pyspark.sql.functions column
# functions imported at the top of this notebook, not the Python builtins.
daily_metrics = [
    # Trip-duration statistics, each rounded to 2 decimal places.
    round(max("trip_duration_mins"), 2).alias("max_trip_duration_mins"),
    round(min("trip_duration_mins"), 2).alias("min_trip_duration_mins"),
    round(avg("trip_duration_mins"), 2).alias("avg_trip_duration_mins"),
    # Number of rides per day, counted on ride_id.
    count("ride_id").alias("total_trips"),
]

# One output row per trip_start_date.
df = silver_rides.groupBy("trip_start_date").agg(*daily_metrics)

# Replace the gold table on every run; overwriteSchema is set so the stored
# schema can be replaced together with the data.
(
    df.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("citibike_dev.03_gold.daily_ride_summary")
)
citibike_etl/notebooks/03_gold/03_gold_citibike_daily_station_performance
from pyspark.sql.functions import avg, count, round
# Gold layer: per-day, per-start-station performance built from the silver
# Citi Bike table.
silver_rides = spark.read.table("citibike_dev.02_silver.jc_citibike")

# NOTE: round below is the pyspark.sql.functions column function imported at
# the top of this notebook, not the Python builtin.
station_metrics = [
    # Mean trip duration, rounded to 2 decimal places.
    round(avg("trip_duration_mins"), 2).alias("avg_trip_duration_mins"),
    # Number of rides in the group, counted on ride_id.
    count("ride_id").alias("total_trips"),
]

# One output row per (trip_start_date, start_station_name) pair.
rides_by_day_and_station = silver_rides.groupBy(
    "trip_start_date", "start_station_name"
)
df = rides_by_day_and_station.agg(*station_metrics)

# Replace the gold table on every run; overwriteSchema is set so the stored
# schema can be replaced together with the data.
(
    df.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("citibike_dev.03_gold.daily_station_performance")
)