DenseNet121_Chexpert_BCE_E5_B32_C1_N5

Version: 1

Trained DenseNet121 architecture using the 'Chexpert_BCE_E5_B32_C1_N5' benchmark. The benchmark was initialized for the chexpert_preprocessed-256-crop dataset with batch size of 32, shuffle set to True and images rescaled to dimension (256, 256). The training was done for 5 epochs using the Adam optimizer and binary_crossentropy loss. A total of 5 labels/pathologies were included in the training and encoded using the 'uzeroes' method. The traing set included 61164 number of sample, the validation set 15207, and the test set 19047.

from pathlib import Path
from dotenv import load_dotenv, find_dotenv
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import json
import os
import re
import pprint

basepath = Path(os.getcwd())
if basepath.name != "idp-radio-1":
    os.chdir(basepath.parent.parent)
    print(os.getcwd())
load_dotenv(find_dotenv())

from src.preprocessing.split.train_test_split import train_test_split

/srv/idp-radio-1

data = json.loads(os.environ['EXP_DATA'])
history = data['history']

Model and Benchmark Summary

for s in data["description"].split(".")[:-1]:
    print(s + ".\n")

Trained DenseNet121 architecture using the 'Chexpert_BCE_E5_B32_C1_N5' benchmark.

 The benchmark was initialized for the chexpert_preprocessed-256-crop dataset with batch size of 32, shuffle set to True and images rescaled to dimension (256, 256).


The training was done for 5 epochs using the Adam optimizer and binary_crossentropy loss.


A total of 5 labels/pathologies were included in the training and encoded using the 'uzeroes' method.


The traing set included 61164 number of sample, the validation set 15207, and the test set 19047.

Extract and format metrics to be plotted

# if there are any metrics that were renamed, add this new name here as ("default_name":"new_name")
metric_custom_names={"auc":"AUC_ROC"}

metric_names = [re.sub("([a-z0-9])([A-Z])","\g<1> \g<2>",name) for name in data["benchmark"]["metrics"]]
metric_keys = [re.sub("([a-z0-9])([A-Z])","\g<1>_\g<2>",name).lower() for name in data["benchmark"]["metrics"]]

for default_name, custom_name in metric_custom_names.items():
    if not default_name in history.keys() and default_name in metric_keys:
        #replace default name with custom name
        metric_keys[metric_keys.index(default_name)]=custom_name

Plot training & validation accuracy values

def print_or_plot_metric(metric_key, metric_name, figure_name):
    if len(history[metric_key]) == 1:
        print("Data for {m_name} only available for a single epoch. \nSkipping plot and printing data...".format(m_name=metric_name))
        print('Train {}: '.format(metric_name), history[metric_key])
        print('Validation {}: '.format(metric_name), history['val_'+metric_key])
        print()        
    else:
        plot_epoch_metric(metric_key, metric_name, figure_name)
        
def plot_epoch_metric(metric_key, metric_name, figure_name):
    figure(num=None, figsize=(10, 6))
    plt.plot(history[metric_key])
    if 'val_'+metric_key in history.keys():
        plt.plot(history['val_'+metric_key])
    plt.title(figure_name)
    plt.ylabel(metric_name)
    plt.xlabel('Epoch')
    if 'val_'+metric_key in history.keys():
        plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

for i, metric_key in enumerate(metric_keys):
    print_or_plot_metric(metric_key, metric_names[i], "Model "+metric_names[i])

png

Plot training & validation loss values

print_or_plot_metric("loss", "Loss", "Model loss")

png

if "lr" in history.keys():
    plot_epoch_metric("lr", "Learning Rate", "Learning Rate")

png

Classification Report

if 'classification_report' in data.keys() and data['classification_report']:
    print(data['classification_report'])

               precision    recall  f1-score   support

        Edema       0.26      0.06      0.09      5201
  Atelectasis       0.00      0.00      0.00      2747
    Pneumonia       0.00      0.00      0.00       788
 Lung Opacity       0.91      1.00      0.96     17423
Consolidation       0.00      0.00      0.00      2717

    micro avg       0.88      0.61      0.72     28876
    macro avg       0.23      0.21      0.21     28876
 weighted avg       0.60      0.61      0.59     28876
  samples avg       0.90      0.70      0.76     28876

Test Scores

if 'test' in data.keys() and data['test']:
    for score_name, score in data["test"].items():
        print('Test {}: '.format(score_name), score)

Test loss:  0.36962762475013733
Test auc:  0.6292253732681274
Test precision:  0.8973305225372314
Test recall:  0.6274414658546448
Test f2_score:  0.6676000952720642
Test binary_accuracy:  0.8652127385139465

Benchmark Details

pp = pprint.PrettyPrinter(indent=4)
if "benchmark" in data.keys():
    pp.pprint(data["benchmark"])

{   'batch_size': 32,
    'benchmark_name': 'Chexpert_BCE_E5_B32_C1_N5',
    'crop': False,
    'dataset_folder': 'data/chexpert/preprocessed-256-crop',
    'dataset_name': 'chexpert_preprocessed-256-crop',
    'dim': [256, 256],
    'drop_last': True,
    'epochs': 5,
    'label_columns': [   'Edema',
                         'Atelectasis',
                         'Pneumonia',
                         'Lung Opacity',
                         'Consolidation'],
    'loss': 'binary_crossentropy',
    'metrics': ['auc', 'precision', 'recall', 'f2_score', 'binary_accuracy'],
    'models_dir': 'models',
    'n_channels': 3,
    'nan_replacement': 0,
    'negative_weights': [   1.3720439672470093,
                            1.1646225452423096,
                            1.0462990999221802,
                            11.449541091918945,
                            1.1718534231185913],
    'optimizer': 'Adam',
    'path_column': 'Path',
    'path_column_prefix': '',
    'positive_weights': [   3.687854290008545,
                            7.074502468109131,
                            22.598669052124023,
                            1.0956979990005493,
                            6.818911552429199],
    'shuffle': True,
    'split_seed': 6122156,
    'test_num_samples': 19047,
    'train_num_samples': 61164,
    'u_enc': 'uzeroes',
    'unc_value': -1,
    'use_class_weights': False,
    'valid_num_samples': 15207}

Data Distribution

if 'benchmark' in data.keys() and 'split_seed' in data['benchmark']:
    benchmark = data['benchmark']

    dataset_path = Path(benchmark['dataset_folder'])
    train_labels = benchmark['train_labels'] if 'train_labels' in benchmark.keys() else 'train.csv'
    split_test_size =  benchmark['split_test_size'] if 'split_test_size' in benchmark.keys() else 0.2
    split_valid_size =  benchmark['split_valid_size'] if 'split_valid_size' in benchmark.keys() else 0.2
    split_group = benchmark['split_group'] if 'split_group' in benchmark.keys() else 'patient_id'
    split_seed = benchmark['split_seed']

    all_labels = pd.read_csv(dataset_path / train_labels)
    train_labels, test_labels = train_test_split(all_labels, test_size=split_test_size, group=split_group, seed=split_seed)
    train_labels, validation_labels = train_test_split(train_labels, test_size=split_valid_size, group=split_group, seed=split_seed)

from src.datasets.u_encoding import uencode

def get_distribution(labels):
    if 'nan_replacement' in benchmark.keys():
        labels = labels.fillna(benchmark['nan_replacement'])
    data = labels.to_numpy()
    data = uencode(benchmark['u_enc'], data, unc_value=benchmark['unc_value'])
    data = pd.DataFrame(data, columns=labels.columns)

    labels = data[benchmark['label_columns']]

    d = {'Pathology': [], 'Positive': [], 'Positive %': [], 'Negative': [], 'Negative %': [],}
    for label in labels.columns:
        values = labels.groupby(label)
        d['Pathology'].append(label)

        positive = values.size()[1.0] if 1.0 in values.size() else 0
        positive_percent = positive / labels.shape[0] * 100
        d['Positive'].append(positive)
        d['Positive %'].append(round(positive_percent))

        negative = values.size()[-0.0] if -0.0 in values.size() else 0
        negative_percent = negative / labels.shape[0] * 100
        d['Negative'].append(negative)
        d['Negative %'].append(round(negative_percent))
    
    df = pd.DataFrame(d)
    df = df.set_index('Pathology')

    return df

if 'benchmark' in data.keys() and 'split_seed' in data['benchmark']:
    train = get_distribution(train_labels)
    val = get_distribution(validation_labels)
    test = get_distribution(test_labels)
    
    positives = train[['Positive %']].merge(val[['Positive %']], left_index=True, right_index=True).merge(test[['Positive %']], left_index=True,  right_index=True).rename(columns={"Positive %_x": "Positives Train", "Positive %_y": "Positives Validation", "Positive %": "Positives Test", })
    positives.copy().plot(kind='bar', figsize=(10,7), title="Positive Labels Distribution")
    
    negatives = train[['Negative %']].merge(val[['Negative %']], left_index=True, right_index=True).merge(test[['Negative %']], left_index=True,  right_index=True).rename(columns={"Negative %_x": "Negative Train", "Negative %_y": "Negative Validation", "Negative %": "Negative Test", })
    negatives.copy().plot(kind='bar', figsize=(10,7), title="Negative Labels Distribution")

    train[['Positive %', 'Negative %']].copy().plot(kind='bar', figsize=(10,7), title="Training set")
    val[['Positive %', 'Negative %']].copy().plot(kind='bar', figsize=(10,7), title="Validation set")
    test[['Positive %', 'Negative %']].copy().plot(kind='bar', figsize=(10,7), title="Test set")