# Evaluation Declaration Example 1 - NOAA-OWP/wres GitHub Wiki

# This is a test of real HEFS baseline validation streamflow data verified through WRES,
# where the observations are 24h mean streamflow (QME) provided in a PI-timeseries XML
# file and the HEFS ensemble forecasts are of 6h instantaneous streamflow (QINE)
# provided in a gzipped tarball.  The HEFS forecasts are compared against ESP forecasts
# of the same variable, allowing for the computation of skill scores; the ESP forecasts
# are also provided in a gzipped tarball.  The evaluation includes temporal rescaling of
# the forecast data and the computation of a temporal offset (due to time zones) in
# order to align the forecasts and observations.
label: Example 1

# Observed dataset: 24h mean streamflow (QME) read from a PI-timeseries XML file.
observed:
  type: observations
  variable: QME
  sources: data/abrfcExample/LGNN5_QME.xml

# Predicted dataset: HEFS ensemble forecasts of QINE, supplied as a gzipped
# tarball of PI-timeseries XML files.
predicted:
  type: ensemble forecasts
  label: HEFS
  variable: QINE
  sources: data/abrfcExample/LGNN5_hefs_flow_1yr.tgz

# Baseline dataset used when calculating skill scores: ESP ensemble forecasts of
# QINE, also supplied as a gzipped tarball of PI-timeseries XML files.
baseline:
  type: ensemble forecasts
  label: ESP
  variable: QINE
  sources: data/abrfcExample/LGNN5_hefs-mefp_flow_1yr.tgz

# The measurement unit of the evaluation.
# NOTE(review): CMS is presumably cubic meters per second - confirm.
unit: CMS

# Restrict the evaluation to pairs whose lead time is exactly 42h (the minimum
# and maximum are equal). In general, it may be more useful to evaluate a longer
# period of lead durations, setting the minimum to "0" hours and the maximum to
# "120", "240", or even longer, depending on the objectives of the evaluation.
# By default, one big pool is created for all lead durations. To create one pool
# for each lead duration, use an explicit lead_time_pools declaration.
lead_times:
  unit: hours
  minimum: 42
  maximum: 42

# Time scale at which the evaluation is performed: a 24-hour mean, which matches
# the time scale of the observations.
time_scale:
  unit: hours
  period: 24
  function: mean

# Thresholds at which to perform the evaluation, expressed as quantiles of the
# distribution of the "observed" data.
probability_thresholds:
  operator: greater equal
  values:
    - 0.1
    - 0.25
    - 0.5
    - 0.75
    - 0.9
    - 0.95

# The metrics to calculate. The "skill score" metrics are the ones for which the
# ESP baseline forecasts are declared in this file.
metrics:
  - mean error
  - continuous ranked probability skill score
  - relative operating characteristic score
  - mean square error skill score
  - brier skill score
  - relative operating characteristic diagram
  - brier score
  - sample size
  - continuous ranked probability score

# The decimal format to use when writing numeric outputs. The value must stay
# quoted: an unquoted value beginning with "#" would be read as a YAML comment,
# leaving the key empty.
# NOTE(review): the pattern looks like a java.text.DecimalFormat pattern with six
# decimal places - confirm.
decimal_format: '#0.000000'

# The output formats to write. Each entry is either a bare format name (csv,
# pairs) or, as for png here, a mapping that names the format and carries
# additional options for it.
output_formats:
  - csv
  - pairs
  - format: png
    orientation: lead threshold  # Plot by lead time, then threshold