Simulating mortality data - sparklabnyc/resources GitHub Wiki
It's useful to be able to check your model against simulated data for 2 reasons:
- Deaths are identifiable and cannot be shared around
- We want to check that our model can recover the parameters from our simulation
Below is some example R
code on how to simulate deaths for a single county with variation over age and time.
library(tidyverse)
set.seed(1)
years <- 1990:2020
populations <- list()
for (y in years) {
print(y)
population <- read_csv(str_c("https://raw.githubusercontent.com/rmp15/cdc_population_monthly_infer/main/output/population_processed/5_year_age_groups/vintage_2020/pop_monthly_5_year_age_groups_", y, ".csv")) |>
filter(month == 6) # true midyear population (June) only
populations[[str_c(y)]] <- population
}
populations <- bind_rows(populations)
# for simplicity, take only one sex and only one county
# Let's choose LA county ()
# (national would not be sparse enough)
populations <- populations |>
filter(sex == 1) |>
filter(fips == "36047")
# age terms taken (roughly) from UK study (Bennett et al. 2015)
age_effect <- tibble(
age = unique(population$age),
age_effect = -5.5 + c(-1.0,-3,-2.9,-1.8,-1.7,-1.7,-1.6,-1.4,-.8,-.2,.3,1.0,1.5,1.9,2.4,2.8,3.4,3.8)
)
# overall downwards slope + iid for each year
time_effect <- tibble(
year = years,
time_effect = -0.02 * (seq_along(years) - mean(seq_along(years))) + rnorm(length(years)) * 0.01
)
# iid age-time noise
age_time_effect <- expand_grid(
age = unique(population$age),
year = years
)
age_time_effect$age_time_effect <- rnorm(dim(age_time_effect)[1]) * 0.005
populations <- populations |>
left_join(age_effect) |>
left_join(time_effect) |>
left_join(age_time_effect) |>
# expit to get death rate
mutate(death_rate = inv.logit(age_effect + time_effect + age_time_effect))
# simulate binomial death rate
populations$deaths <- rbinom(n = populations$pop, size = populations$pop, prob = populations$death_rate)
populations <- populations |>
select(-age_effect, -time_effect, -age_time_effect, -death_rate)
write_csv(populations, "simulated_deaths.csv")
And here is some code to simulate for multiple counties, years, ages and races.
library(tidyverse)
set.seed(1)
years <- 1999:2020
populations <- list()
for (y in years) {
print(y)
population <- read_csv(str_c("https://raw.githubusercontent.com/rmp15/cdc_population_monthly_infer/main/output/population_processed/5_year_age_groups/vintage_2020/pop_monthly_5_year_age_groups_", y, ".csv")) |>
filter(month == 6) # true midyear population (June) only
populations[[str_c(y)]] <- population
}
populations <- bind_rows(populations)
# for simplicity, take only one sex
populations <- populations |>
filter(sex == 1) |>
select(-month)
# add races
grid <- expand_grid(
age = unique(populations$age),
fips = unique(populations$fips),
year = years,
race = 1:4 # 4 imaginary races
) |>
# simulation assumes the population is the same for each race
left_join(populations)
grid <- grid |> mutate(state = substr(fips, 1, 2))
# for simplicity, take only four states
grid <- grid |> filter(state %in% c("01", "12", "17", "36"))
# age terms taken (roughly) from UK study (Bennett et al. 2015)
age_effect <- tibble(
age = unique(population$age),
age_effect = -5.5 + c(-1.0,-3,-2.9,-1.8,-1.7,-1.7,-1.6,-1.4,-.8,-.2,.3,1.0,1.5,1.9,2.4,2.8,3.4,3.8)
)
# overall downwards slope + iid for each year
time_effect <- tibble(
year = years,
time_effect = -0.02 * (seq_along(years) - mean(seq_along(years))) + rnorm(length(years)) * 0.01
)
# two tier spatial random effect
# 0.01 variance between states
# 0.005 variance between counties
state_effect <- tibble(
state = unique(grid$state),
state_effect = rnorm(length(unique(grid$state))) * 0.01
)
county_effect <- tibble(
fips = unique(grid$fips),
space_effect = rnorm(length(unique(grid$fips))) * 0.005
)
# race effect
race_effect <- tibble(
race = 1:4,
race_effect = 0.01 * c(0.0, 1.0, -1.0, 0.5)
)
# iid age-time-race noise
age_time_race_effect <- expand_grid(
age = unique(grid$age),
year = years,
race = 1:4
)
age_time_race_effect$age_time_race_effect <- rnorm(dim(age_time_race_effect)[1]) * 0.005
grid <- grid |>
left_join(age_effect) |>
left_join(time_effect) |>
left_join(state_effect) |>
left_join(county_effect) |>
left_join(race_effect) |>
left_join(age_time_race_effect) |>
# expit to get death rate
mutate(death_rate = inv.logit(age_effect + time_effect + state_effect + space_effect + race_effect + age_time_race_effect))
# simulate binomial death rate
grid$deaths <- rbinom(n = grid$pop, size = grid$pop, prob = grid$death_rate)
grid <- grid |> select(-contains("_effect"), -death_rate)
grid <- grid |> arrange(fips, year, age, race)
write_csv(grid, "simulated_deaths_race.csv")