Simulating mortality data - sparklabnyc/resources GitHub Wiki

It's useful to be able to check your model against simulated data for 2 reasons:

  1. Deaths are identifiable and cannot be shared around
  2. We want to check that our model can recover the parameters from our simulation

Below is some example R code on how to simulate deaths for a single county with variation over age and time.

library(tidyverse)

set.seed(1)

years <- 1990:2020
populations <- list()
for (y in years) {
  print(y)
  population <- read_csv(str_c("https://raw.githubusercontent.com/rmp15/cdc_population_monthly_infer/main/output/population_processed/5_year_age_groups/vintage_2020/pop_monthly_5_year_age_groups_", y, ".csv")) |>
    filter(month == 6) # true midyear population (June) only
  
  populations[[str_c(y)]] <- population
}
populations <- bind_rows(populations)

# for simplicity, take only one sex and only one county
# Let's choose LA county ()
# (national would not be sparse enough)
populations <- populations |>
  filter(sex == 1) |>
  filter(fips == "36047")

# age terms taken (roughly) from UK study (Bennett et al. 2015)
age_effect <- tibble(
  age = unique(population$age),
  age_effect = -5.5 + c(-1.0,-3,-2.9,-1.8,-1.7,-1.7,-1.6,-1.4,-.8,-.2,.3,1.0,1.5,1.9,2.4,2.8,3.4,3.8)
)

# overall downwards slope + iid for each year
time_effect <- tibble(
  year = years,
  time_effect = -0.02 * (seq_along(years) - mean(seq_along(years))) + rnorm(length(years)) * 0.01
)
# iid age-time noise
age_time_effect <- expand_grid(
  age = unique(population$age),
  year = years
)
age_time_effect$age_time_effect <- rnorm(dim(age_time_effect)[1]) * 0.005

populations <- populations |>
  left_join(age_effect) |>
  left_join(time_effect) |>
  left_join(age_time_effect) |>
  # expit to get death rate
  mutate(death_rate = inv.logit(age_effect + time_effect + age_time_effect))

# simulate binomial death rate
populations$deaths <-  rbinom(n = populations$pop, size = populations$pop, prob = populations$death_rate)

populations <- populations |>
  select(-age_effect, -time_effect, -age_time_effect, -death_rate)

write_csv(populations, "simulated_deaths.csv")

And here is some code to simulate for multiple counties, years, ages and races.

library(tidyverse)

set.seed(1)

years <- 1999:2020
populations <- list()
for (y in years) {
  print(y)
  population <- read_csv(str_c("https://raw.githubusercontent.com/rmp15/cdc_population_monthly_infer/main/output/population_processed/5_year_age_groups/vintage_2020/pop_monthly_5_year_age_groups_", y, ".csv")) |>
    filter(month == 6) # true midyear population (June) only

  populations[[str_c(y)]] <- population
}
populations <- bind_rows(populations)

# for simplicity, take only one sex
populations <- populations |>
  filter(sex == 1) |>
  select(-month)

# add races
grid <- expand_grid(
  age = unique(populations$age),
  fips = unique(populations$fips),
  year = years,
  race = 1:4 # 4 imaginary races
) |>
  # simulation assumes the population is the same for each race
  left_join(populations)

grid <- grid |> mutate(state = substr(fips, 1, 2))

# for simplicity, take only four states
grid <- grid |> filter(state %in% c("01", "12", "17", "36"))

# age terms taken (roughly) from UK study (Bennett et al. 2015)
age_effect <- tibble(
  age = unique(population$age),
  age_effect = -5.5 + c(-1.0,-3,-2.9,-1.8,-1.7,-1.7,-1.6,-1.4,-.8,-.2,.3,1.0,1.5,1.9,2.4,2.8,3.4,3.8)
)

# overall downwards slope + iid for each year
time_effect <- tibble(
  year = years,
  time_effect = -0.02 * (seq_along(years) - mean(seq_along(years))) + rnorm(length(years)) * 0.01
)

# two tier spatial random effect
# 0.01 variance between states
# 0.005 variance between counties
state_effect <- tibble(
  state = unique(grid$state),
  state_effect = rnorm(length(unique(grid$state))) * 0.01
)

county_effect <- tibble(
  fips = unique(grid$fips),
  space_effect = rnorm(length(unique(grid$fips))) * 0.005
)

# race effect
race_effect <- tibble(
  race = 1:4,
  race_effect = 0.01 * c(0.0, 1.0, -1.0, 0.5)
)

# iid age-time-race noise
age_time_race_effect <- expand_grid(
  age = unique(grid$age),
  year = years,
  race = 1:4
)
age_time_race_effect$age_time_race_effect <- rnorm(dim(age_time_race_effect)[1]) * 0.005

grid <- grid |>
  left_join(age_effect) |>
  left_join(time_effect) |>
  left_join(state_effect) |>
  left_join(county_effect) |>
  left_join(race_effect) |>
  left_join(age_time_race_effect) |>
  # expit to get death rate
  mutate(death_rate = inv.logit(age_effect + time_effect + state_effect + space_effect + race_effect + age_time_race_effect))

# simulate binomial death rate
grid$deaths <-  rbinom(n = grid$pop, size = grid$pop, prob = grid$death_rate)

grid <- grid |> select(-contains("_effect"), -death_rate)

grid <- grid |> arrange(fips, year, age, race)

write_csv(grid, "simulated_deaths_race.csv")
⚠️ **GitHub.com Fallback** ⚠️