Formatting output from multiple runs

For the ICCS paper I used a few functions to format the data: averaging metrics across runs, scaling the outputs up to population size, and so on.
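All of the snippets below rely on pandas, NumPy, and the standard library's os and random modules, so they assume the following imports:

import os
import random

import numpy as np
import pandas as pd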

Averaging the output from all runs in a given folder (non-spatial model)

def average_output_from_runs_v1(filepath, sample_size):
    # collect the output from each run in a list of dataframes
    run_outputs = []
    # iterate over each file in the folder
    for file in os.listdir(filepath):
        # read in the data
        data = pd.read_csv(os.path.join(filepath, file), delimiter='\t')
        # drop the unnamed column
        data = data.drop('Unnamed: 10', axis=1)
        # store the output
        run_outputs.append(data)
    # stack the runs into a single dataframe
    storage_df = pd.concat(run_outputs)
    # calculate the average value of each output on each day
    storage_df = storage_df.groupby('time').mean()
    # scale the outputs to the population size
    storage_df *= 100 / sample_size
    # calculate the total number of new cases
    storage_df['new_cases'] = storage_df['metric_new_cases_asympt'] + storage_df['metric_new_cases_sympt']
    # return the averaged outputs
    return storage_df
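For reference, a call might look like this. The folder path is a placeholder, and the sample size of 25 assumes the runs simulated 25% of the population (the function scales by 100 / sample_size):

averaged = average_output_from_runs_v1('./output/non_spatial/', sample_size=25)
# daily averages, scaled up to the full population
print(averaged['new_cases'].head())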

Averaging the output from all runs in a given folder (spatial model)

def average_output_from_runs_v3(filepath, sample_size):
    # collect the output from each run in a list of dataframes
    run_outputs = []
    # iterate over each file in the folder
    for file in os.listdir(filepath):
        # read in the data
        data = pd.read_csv(os.path.join(filepath, file), delimiter='\t')
        # drop the unnamed column
        data = data.drop('Unnamed: 10', axis=1)
        # sum the estimates from each district by time to get a population-level overview,
        # keeping only the numeric columns (this drops the district id)
        data = data.groupby('time').sum(numeric_only=True)
        # calculate the number of new cases
        data['new_cases'] = data['metric_new_cases_asympt'] + data['metric_new_cases_sympt']
        # store the per-run, population-level metrics
        run_outputs.append(data)
    # stack the runs into a single dataframe
    storage_df = pd.concat(run_outputs)
    # scale estimates to population levels
    storage_df *= 100 / sample_size
    # average the estimates across runs by time
    storage_df = storage_df.groupby('time').mean()
    # return the dataframe
    return storage_df
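A quick way to sanity-check the averaged spatial output is to plot the epidemic curve. This is just a sketch: matplotlib is an assumed plotting dependency and the folder path is a placeholder:

import matplotlib.pyplot as plt

spatial_avg = average_output_from_runs_v3('./output/spatial/', sample_size=25)
spatial_avg['new_cases'].plot()
plt.xlabel('time (days)')
plt.ylabel('new cases (scaled to population)')
plt.show()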

Calculating the standard deviation in the output from all runs in a given folder (non-spatial model)

def std_output_from_runs_v1(filepath, sample_size):
    # collect the output from each run in a list of dataframes
    run_outputs = []
    # iterate over each file in the folder
    for file in os.listdir(filepath):
        # read in the data
        data = pd.read_csv(os.path.join(filepath, file), delimiter='\t')
        # drop the unnamed column
        data = data.drop('Unnamed: 10', axis=1)
        # store the output
        run_outputs.append(data)
    # stack the runs into a single dataframe
    storage_df = pd.concat(run_outputs)
    # calculate the total number of new cases per run-day before aggregating,
    # since the standard deviation of a sum is not the sum of the standard deviations
    storage_df['new_cases'] = storage_df['metric_new_cases_asympt'] + storage_df['metric_new_cases_sympt']
    # calculate the standard deviation of each output on each day
    storage_df = storage_df.groupby('time').std()
    # scale the outputs to the population size
    storage_df *= 100 / sample_size
    # return the standard deviations
    return storage_df
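The mean and standard deviation functions are typically used together, for example to build an uncertainty band around the averaged curve. A minimal sketch, again with a placeholder path and sample size:

mean_df = average_output_from_runs_v1('./output/non_spatial/', sample_size=25)
std_df = std_output_from_runs_v1('./output/non_spatial/', sample_size=25)
# one-standard-deviation band around the mean, floored at zero
upper = mean_df['new_cases'] + std_df['new_cases']
lower = (mean_df['new_cases'] - std_df['new_cases']).clip(lower=0)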

Calculating the standard deviation in the output from all runs in a given folder (spatial model)

def std_output_from_runs_v3(filepath, sample_size):
    # collect the output from each run in a list of dataframes
    run_outputs = []
    # iterate over each file in the folder
    for file in os.listdir(filepath):
        # read in the data
        data = pd.read_csv(os.path.join(filepath, file), delimiter='\t')
        # drop the unnamed column
        data = data.drop('Unnamed: 10', axis=1)
        # sum the estimates from each district by time to get a population-level overview,
        # keeping only the numeric columns (this drops the district id)
        data = data.groupby('time').sum(numeric_only=True)
        # calculate the number of new cases
        data['new_cases'] = data['metric_new_cases_asympt'] + data['metric_new_cases_sympt']
        # store the per-run, population-level metrics
        run_outputs.append(data)
    # stack the runs into a single dataframe
    storage_df = pd.concat(run_outputs)
    # scale estimates to population levels
    storage_df *= 100 / sample_size
    # calculate the standard deviation across runs by time
    storage_df = storage_df.groupby('time').std()
    # return the dataframe
    return storage_df
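One use for the spatial mean and standard deviation together is a quick check of run-to-run variability via the coefficient of variation. A sketch with placeholder values:

spatial_mean = average_output_from_runs_v3('./output/spatial/', sample_size=25)
spatial_std = std_output_from_runs_v3('./output/spatial/', sample_size=25)
# coefficient of variation, with days of zero mean cases masked out
cv = spatial_std['new_cases'] / spatial_mean['new_cases'].replace(0, np.nan)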

Calculating the mean and standard deviation in the output for n runs in a given folder (non-spatial model)

def sample_output_from_runs_v1(filepath, sample_size, number_of_samples):
    # collect the output from each sampled run in a list of dataframes
    run_outputs = []
    # iterate over a random sample of files in the folder
    for file in random.sample(os.listdir(filepath), number_of_samples):
        # read in the data
        data = pd.read_csv(os.path.join(filepath, file), delimiter='\t')
        # drop the unnamed column
        data = data.drop('Unnamed: 10', axis=1)
        # store the output
        run_outputs.append(data)
    # stack the sampled runs into a single dataframe
    storage_df = pd.concat(run_outputs)
    # calculate the total number of new cases
    storage_df['new_cases'] = storage_df['metric_new_cases_asympt'] + storage_df['metric_new_cases_sympt']
    # calculate the mean and standard deviation of each output on each day
    mean_df = storage_df.groupby('time').mean()
    std_df = storage_df.groupby('time').std()
    # scale the outputs to the population size
    mean_df *= 100 / sample_size
    std_df *= 100 / sample_size
    # return the outputs
    return mean_df, std_df
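Because this function draws a random subset of runs, it can help to seed the random module so the same runs are picked each time. The seed, path, and argument values below are illustrative:

random.seed(0)  # fix the random sample of runs so results are reproducible
mean_df, std_df = sample_output_from_runs_v1('./output/non_spatial/', sample_size=25, number_of_samples=10)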

Calculating the mean and standard deviation in the output for n runs in a given folder (spatial model)

def sample_output_from_runs_v3(filepath, sample_size, number_of_samples):
    # collect the output from each sampled run in a list of dataframes
    run_outputs = []
    # iterate over a random sample of files in the folder
    for file in random.sample(os.listdir(filepath), number_of_samples):
        # read in the data
        data = pd.read_csv(os.path.join(filepath, file), delimiter='\t')
        # drop the unnamed column
        data = data.drop('Unnamed: 10', axis=1)
        # sum the estimates from each district by time to get a population-level overview,
        # keeping only the numeric columns (this drops the district id)
        data = data.groupby('time').sum(numeric_only=True)
        # calculate the number of new cases
        data['new_cases'] = data['metric_new_cases_asympt'] + data['metric_new_cases_sympt']
        # store the per-run, population-level metrics
        run_outputs.append(data)
    # stack the sampled runs into a single dataframe
    storage_df = pd.concat(run_outputs)
    # scale estimates to population levels (once only)
    storage_df *= 100 / sample_size
    # calculate the mean and standard deviation across runs by time
    mean_df = storage_df.groupby('time').mean()
    std_df = storage_df.groupby('time').std()
    # return the dataframes
    return mean_df, std_df
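The paired outputs lend themselves to a mean curve with a shaded band. A sketch assuming matplotlib, with placeholder path and argument values:

import matplotlib.pyplot as plt

m, s = sample_output_from_runs_v3('./output/spatial/', sample_size=25, number_of_samples=10)
plt.plot(m.index, m['new_cases'])
# shade one standard deviation either side of the mean, floored at zero
plt.fill_between(m.index, (m['new_cases'] - s['new_cases']).clip(lower=0),
                 m['new_cases'] + s['new_cases'], alpha=0.3)
plt.xlabel('time (days)')
plt.ylabel('new cases (scaled to population)')
plt.show()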

Calculating the spatial spread of cases at a specific time from n runs in a given folder (spatial model)

def calculate_has_cases_in_space_with_samples(filepath, timemax, samples):
    # collect the output from each sampled run in a list of dataframes
    run_outputs = []
    # iterate over a random sample of files in the folder
    for file in random.sample(os.listdir(filepath), samples):
        # load in the data
        data = pd.read_csv(os.path.join(filepath, file), delimiter='\t')
        # drop the unnamed column
        data = data.drop('Unnamed: 10', axis=1)
        # calculate the new cases in each district
        data['new_cases'] = data['metric_new_cases_asympt'] + data['metric_new_cases_sympt']
        # dummy variables to store each district's cumulative cases and whether it has recorded any cases yet
        data['cumulative_cases'] = 0
        data['has_cases'] = 0
        # iterate over each district
        for district in data.myId.unique():
            # isolate the rows for this district
            district_index = data.loc[data['myId'] == district].index
            # calculate the cumulative cases over time in this district
            cumulative = np.cumsum(data.loc[district_index, 'new_cases'].values)
            data.loc[district_index, 'cumulative_cases'] = cumulative
            # flag, as 1s and 0s, the points in time at which the cumulative
            # number of cases in this district is greater than zero
            data.loc[district_index, 'has_cases'] = (cumulative > 0).astype(int)
        # store this run in the list
        run_outputs.append(data)
    # stack the sampled runs into a single dataframe
    storage_df = pd.concat(run_outputs)
    # strip the two-character prefix from each district id to get the district number
    storage_df['district_number'] = [int(dist[2:]) for dist in storage_df.myId.values]
    # filter the dataframe up to a specified time
    storage_df = storage_df.loc[storage_df['time'] <= timemax]
    # average the estimates from each run by district, keeping only numeric columns
    storage_df = storage_df.groupby('myId').mean(numeric_only=True)
    # sort the districts so the estimates are ordered correctly 1, 2, 3, ..., 60
    storage_df = storage_df.sort_values('district_number')
    # return the dataframe
    return storage_df
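A call might look like the following; the day-60 cutoff and the five-run sample are illustrative values, not settings from the paper:

spread_df = calculate_has_cases_in_space_with_samples('./output/spatial/', timemax=60, samples=5)
# per district: the averaged has_cases flag over all sampled run-days up to day 60
print(spread_df[['district_number', 'has_cases']].head())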

Calculating the spatial spread of cases at a specific time for all runs in a given folder (spatial model)

def calculate_has_cases_in_space_all_runs(filepath, timemax):
    # collect the output from each run in a list of dataframes
    run_outputs = []
    # iterate over every file in the folder
    for file in os.listdir(filepath):
        # load in the data
        data = pd.read_csv(os.path.join(filepath, file), delimiter='\t')
        # drop the unnamed column
        data = data.drop('Unnamed: 10', axis=1)
        # calculate the new cases in each district
        data['new_cases'] = data['metric_new_cases_asympt'] + data['metric_new_cases_sympt']
        # dummy variables to store each district's cumulative cases and whether it has recorded any cases yet
        data['cumulative_cases'] = 0
        data['has_cases'] = 0
        # iterate over each district
        for district in data.myId.unique():
            # isolate the rows for this district
            district_index = data.loc[data['myId'] == district].index
            # calculate the cumulative cases over time in this district
            cumulative = np.cumsum(data.loc[district_index, 'new_cases'].values)
            data.loc[district_index, 'cumulative_cases'] = cumulative
            # flag, as 1s and 0s, the points in time at which the cumulative
            # number of cases in this district is greater than zero
            data.loc[district_index, 'has_cases'] = (cumulative > 0).astype(int)
        # store this run in the list
        run_outputs.append(data)
    # stack the runs into a single dataframe
    storage_df = pd.concat(run_outputs)
    # strip the two-character prefix from each district id to get the district number
    storage_df['district_number'] = [int(dist[2:]) for dist in storage_df.myId.values]
    # filter the dataframe up to a specified time
    storage_df = storage_df.loc[storage_df['time'] <= timemax]
    # average the estimates from each run by district, keeping only numeric columns
    storage_df = storage_df.groupby('myId').mean(numeric_only=True)
    # sort the districts so the estimates are ordered correctly 1, 2, 3, ..., 60
    storage_df = storage_df.sort_values('district_number')
    # return the dataframe
    return storage_df
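This version produces the same shape of output as the sampled one, just averaged over every run in the folder. An illustrative call, with a placeholder path and cutoff:

spread_all_df = calculate_has_cases_in_space_all_runs('./output/spatial/', timemax=60)
print(spread_all_df['has_cases'].head())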

Calculating the cumulative number of districts with Covid cases from n runs in a given folder (spatial model)

def calculate_cumulative_spread_in_districts(filepath, samples):
    # collect the output from each sampled run in a list of dataframes
    run_outputs = []
    # iterate over a random sample of files in the folder
    for file in random.sample(os.listdir(filepath), samples):
        # load the data
        data = pd.read_csv(os.path.join(filepath, file), delimiter='\t')
        # drop the unnamed column
        data = data.drop('Unnamed: 10', axis=1)
        # calculate the new cases in each district
        data['new_cases'] = data['metric_new_cases_asympt'] + data['metric_new_cases_sympt']
        # dummy variables to store each district's cumulative cases and whether it has recorded any cases yet
        data['cumulative_cases'] = 0
        data['has_cases'] = 0
        # iterate over each district
        for district in data.myId.unique():
            # isolate the rows for this district
            district_index = data.loc[data['myId'] == district].index
            # calculate the cumulative cases over time in this district
            cumulative = np.cumsum(data.loc[district_index, 'new_cases'].values)
            data.loc[district_index, 'cumulative_cases'] = cumulative
            # flag, as 1s and 0s, the points in time at which the cumulative
            # number of cases in this district is greater than zero
            data.loc[district_index, 'has_cases'] = (cumulative > 0).astype(int)
        # store this run in the list
        run_outputs.append(data)
    # stack the sampled runs into a single dataframe
    storage_df = pd.concat(run_outputs)
    # count the infected districts at each point in time, summed over all sampled runs
    n_infected_districts = storage_df.groupby('time').sum(numeric_only=True)
    # average this out by the number of samples
    cumulative_n_infected_dist = list(np.divide(n_infected_districts['has_cases'].values, samples))
    # return the list of cumulative districts with Covid cases
    return cumulative_n_infected_dist
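Since the function returns a plain list ordered by time, plotting it directly shows how quickly the epidemic reaches new districts. A sketch assuming matplotlib, with placeholder argument values:

import matplotlib.pyplot as plt

cumulative = calculate_cumulative_spread_in_districts('./output/spatial/', samples=5)
plt.plot(cumulative)
plt.xlabel('time step')
plt.ylabel('average number of districts with cases')
plt.show()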