Lesson 12

Lesson 12: In this lesson, we dove further into machine learning and the concepts of model training. We spent a lot of time trying to train and run the model on the Raspberry Pi, but ran into various compatibility and efficiency issues along the way, and ultimately used a PC to obtain proper results. In the end, we were able to create an audio classification application capable of distinguishing between ten classes of urban sounds. We then ported this into a small Flask application to run on a webpage.

Pre-processing

First, we downloaded the UrbanSound8K dataset and used librosa to load the .wav files and create Mel-Frequency Cepstral Coefficient (MFCC) spectrograms for each. When loading files, librosa automatically resamples to 22.05 kHz, normalizes bit depths (values between -1 and 1), and mixes down to a single channel (mono). We use librosa's mfcc function to generate MFCCs for 40 coefficients, and numpy's pad function to standardize the length of each spectrogram to 174 frames. Along with the single (mono) channel, each example therefore has the shape (40, 174, 1). We then used a pandas DataFrame, LabelEncoder, and to_categorical to encode the dataset and labels into numpy arrays to be saved and loaded later for training (x_train, x_test, y_train, y_test, yy, le).
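
The pad length of 174 frames lines up with the dataset: UrbanSound8K clips are at most four seconds long, and at librosa's defaults a 4-second clip produces 173 MFCC frames, so a max_pad_len of 174 is enough to hold any clip. A quick back-of-the-envelope check (a sketch assuming librosa's default sample rate of 22050 Hz and hop length of 512):

sr = 22050                              # librosa's default sample rate
hop_length = 512                        # librosa's default hop length
n_samples = 4 * sr                      # 88200 samples in a 4-second clip
n_frames = 1 + n_samples // hop_length  # centered framing: 1 + 172 frames
print(n_frames)                         # 173, safely below max_pad_len = 174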

import pandas as pd
import numpy as np
import librosa
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# PREPROCESSING

# extracts and returns MFCC spectrogram
def extract_features(file_name):
    max_pad_len = 174
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        # pad (or, for safety, truncate) the time axis to exactly 174 frames
        if mfccs.shape[1] < max_pad_len:
            pad_width = max_pad_len - mfccs.shape[1]
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            mfccs = mfccs[:, :max_pad_len]
        print(mfccs.shape)

    except Exception as e:
        print("Error encountered while parsing file: ", file_name)
        print(e)
        return None

    return mfccs

dataset_path = "E:/Documents/CS490/Lesson12/UrbanSound8k/audio/"

metadata = pd.read_csv("E:/Documents/CS490/Lesson12/UrbanSound8k/metadata/UrbanSound8k.csv")

features = []

classes = set()

# extract MFCC spectrogram from all sound files
for ndx, row in metadata.iterrows():
    filename = os.path.join(
        os.path.abspath(dataset_path),
        "fold" + str(row["fold"]),
        str(row["slice_file_name"])
    )
    class_label = row["class"]
    classes.add(class_label)
    data = extract_features(filename)
    if data is not None:  # skip files that failed to parse
        features.append([data, class_label])

# convert to pandas dataframe
featuresdf = pd.DataFrame(features, columns=['feature', 'class_label'])

print(classes)

X = np.array(featuresdf.feature.tolist())
y = np.array(featuresdf.class_label.tolist())

print(X.shape)
print(y.shape)

le = LabelEncoder()
yy = to_categorical(le.fit_transform(y))

print(yy.shape)

x_train, x_test, y_train, y_test = train_test_split(X, yy, test_size=0.2, random_state=42)

np.save("./preprocessed/x_train.npy", x_train)
np.save("./preprocessed/x_test.npy", x_test)
np.save("./preprocessed/y_train.npy", y_train)
np.save("./preprocessed/y_test.npy", y_test)
np.save("./preprocessed/yy.npy", yy)
np.save("./preprocessed/classes.npy", le.classes_)

Constructing and training the model

Next, we constructed, trained, and tested a CNN on our preprocessed dataset using Keras.

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime
from sklearn.preprocessing import LabelEncoder

x_train = np.load("./preprocessed/x_train.npy")
y_train = np.load("./preprocessed/y_train.npy")
x_test = np.load("./preprocessed/x_test.npy")
y_test = np.load("./preprocessed/y_test.npy")
yy = np.load("./preprocessed/yy.npy")
le = LabelEncoder()
le.classes_ = np.load("./preprocessed/classes.npy")

# MODEL

num_rows = 40
num_columns = 174
num_channels = 1

x_train = x_train.reshape(x_train.shape[0], num_rows, num_columns, num_channels)
x_test = x_test.reshape(x_test.shape[0], num_rows, num_columns, num_channels)

num_labels = yy.shape[1]

def construct_model():
    # Construct model: four convolutional blocks with doubling filter counts
    model = Sequential()
    model.add(Conv2D(filters=16, kernel_size=2, input_shape=(num_rows, num_columns, num_channels), activation='relu'))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Dropout(0.2))

    model.add(Conv2D(filters=32, kernel_size=2, activation='relu'))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Dropout(0.2))

    model.add(Conv2D(filters=64, kernel_size=2, activation='relu'))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Dropout(0.2))

    model.add(Conv2D(filters=128, kernel_size=2, activation='relu'))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Dropout(0.2))
    model.add(GlobalAveragePooling2D())

    model.add(Dense(num_labels, activation='softmax'))

    # Compile the model
    model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

    # Display model architecture summary
    model.summary()

    # Save the (still untrained) model; the best weights are written separately
    # by the ModelCheckpoint callback during training
    model.save("./model.h5")

    return model


model = construct_model()
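
Note the GlobalAveragePooling2D layer in place of the usual Flatten: it collapses each of the 128 final feature maps to its average value, so the closing Dense layer only maps a 128-dimensional vector onto the ten classes, which keeps the classifier head's parameter count small.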

# TRAINING

num_epochs = 72
num_batch_size = 256

# keep only the checkpoint with the best validation loss (Keras' default monitor)
checkpointer = ModelCheckpoint(filepath='saved_models/weights.best.basic_cnn.hdf5',
                               verbose=1, save_best_only=True)
start = datetime.now()

model.fit(x_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(x_test, y_test), callbacks=[checkpointer], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

# TESTING

# Evaluating the model on the training and testing set
score = model.evaluate(x_train, y_train, verbose=0)
print("Training Accuracy: ", score[1])

score = model.evaluate(x_test, y_test, verbose=0)
print("Testing Accuracy: ", score[1])

Flask application

Finally, we loaded the model into a simple Flask application that lets a user upload a short .wav file and view the CNN's predicted classification. We used flask-wtf, wtforms, and flask_uploads to handle the UI and file handling for the application (the templates/index.html page referenced below, not shown here, renders the upload form and the returned status lines). Once again, we used librosa to load the .wav and generate MFCCs for the uploaded example before using it as input for the classifier.

from flask import Flask, render_template
from flask_wtf import FlaskForm
from wtforms import FileField
from flask_uploads import configure_uploads, AUDIO, UploadSet
import os
import numpy as np
import librosa
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder

# Load model and label encoder
model = keras.models.load_model("./saved_models/weights.best.basic_cnn.hdf5")
le = LabelEncoder()
le.classes_ = np.load("./saved_models/classes.npy")

# data dims
num_rows = 40
num_columns = 174
num_channels = 1

# CODE FOR FLASK APP

app = Flask(__name__, static_folder=os.path.join(os.getcwd(), "static"))
app.config['SECRET_KEY'] = "clishmaclaver"
app.config['UPLOADED_AUDIOSET_DEST'] = "static"

audioset = UploadSet("audioset", AUDIO)
configure_uploads(app, audioset)

class AudioClassifyForm(FlaskForm):
    audio = FileField("audio")

@app.route("/", methods=["GET", "POST"])
def home():
    form = AudioClassifyForm()
    if form.validate_on_submit():
        filename = audioset.save(form.audio.data)
        filename = os.path.join(os.getcwd(), "static", filename)
        # print_prediction extracts and reshapes the MFCC features itself
        status = print_prediction(filename)
        return render_template("index.html", form=form, status=status)
    return render_template("index.html", form=form, status="")

@app.route("/about")
def about():
    return "A simple Flask project for audio classification"

# CODE FOR AUDIO CLASSIFICATION

# print predictions
def print_prediction(file_name):
    status = []
    prediction_feature = extract_features(file_name)
    if prediction_feature is None:  # extraction failed
        return ["Error: could not process the uploaded file."]
    prediction_feature = prediction_feature.reshape(1, num_rows, num_columns, num_channels)

    # run the model once and reuse the class probabilities for both outputs
    predicted_proba = model.predict(prediction_feature)[0]
    predicted_class = le.inverse_transform(np.array([np.argmax(predicted_proba)]))
    status.append("The predicted class is: " + str(predicted_class[0]))

    for i in range(len(predicted_proba)):
        category = le.inverse_transform(np.array([i]))
        string = category[0] + ": " + format(predicted_proba[i], '.32f')
        status.append(string)
    return status

# extracts and returns MFCC spectrogram
def extract_features(file_name):
    max_pad_len = 174
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast') 
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        pad_width = max_pad_len - mfccs.shape[1]
        mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        print(mfccs.shape)

    except Exception as e:
        print("Error encountered while parsing file: ", file_name)
        print(e)
        return None 
     
    return mfccs

if __name__ == "__main__":
    app.run()
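
With the checkpointed model and classes.npy in place under saved_models/, the app can be started with python app.py and visited at Flask's default development address, http://127.0.0.1:5000/.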

Uploading audio

[Image: uploading audio]

Classification example

[Image: classification example]

Link to the Code

Link to YouTube Video

Things Learned

In Lesson 12, we learned about various convolutional neural networks. Specifically, we learned how to train a CNN on audio spectrograms. We learned how to use the librosa Python library for standardizing, normalizing, and producing spectrograms from audio waveforms as inputs for machine learning models. In the end, we also learned a lot about compatibility between edge devices and certain machine learning libraries, as well as what kind of efficiency to expect from a less powerful device.