6. Support Vector Machine - ZYL-Harry/Machine_Learning_study GitHub Wiki

Basic Introduction

Objective: find a hyperplane in an N-dimensional space that distinctly classifies the data points
Classification: hard margin SVM, soft margin SVM, kernel SVM

Difference

a different cost function, in which the regularization parameter sits in a different place (C weights the data-fit term, instead of λ weighting the penalty term)
different hypothesis

Large Margin

More detailed mathematical process can be seen in this link: Mathematical process of the basic Support Vector Machine

Kernel

Part of the objective: define extra features using landmarks(initial features) and kernels(like similarity functions) to learn more complex nonlinear classifiers

Details:

Other kernels:
1.polynomial kernel
2.string kernel
3.chi-square kernel
4.histogram intersection kernel
......

Tip:
Kernels can also be applied to linear regression and logistic regression, but the computation is very slow there, whereas the SVM runs efficiently with kernels.

SVM parameter

C
1.large C---means: small λ---low bias and high variance
2.small C---means: large λ---high bias and low variance
σ^2
1.large σ^2---means: features vary more smoothly---high bias and low variance
2.small σ^2---means: features vary less smoothly---low bias and high variance
γ (the parameter actually passed in code; for the Gaussian kernel γ = 1 / (2σ^2))

Multi classification

Advantage in application

Exercise in Python

The basic functions of SVM in python sklearn.svm.SVC() can be seen in this link: sklearn.svm.SVC

1. Support Vector Machines

1.1 read the dataset

def read_data(path):
    """Load the ``X`` and ``y`` arrays stored in a MATLAB ``.mat`` file.

    Args:
        path: Location of the ``.mat`` file, passed straight to ``loadmat``.

    Returns:
        A ``(X, y)`` tuple holding the arrays saved under the keys ``'X'``
        and ``'y'``.  For the first exercise dataset the original author
        recorded their shapes as (51, 2) and (51, 1) respectively.

    Raises:
        KeyError: If the file does not contain an ``'X'`` or ``'y'`` entry.
    """
    contents = loadmat(path)
    return contents['X'], contents['y']

1.2 visualize the dataset

def visiualize_data(X, y):
    """Scatter-plot a two-class dataset with one marker style per class.

    Rows of ``X`` whose label in ``y`` equals 1 are drawn as black ``+``
    markers and rows whose label equals 0 as yellow circles.  The first two
    columns of ``X`` are used as the plot coordinates.

    Args:
        X: Feature array; only its first two columns are plotted.
        y: Label array aligned with the rows of ``X`` (values 1 and 0).

    Returns:
        None.  A new figure is opened and displayed with ``plt.show()``.
    """
    # Gather both classes before any figure is opened.
    groups = []
    for label, colour, symbol in ((1, 'k', '+'), (0, 'y', 'o')):
        rows = np.argwhere(y == label)[:, 0]
        # Columns: the features followed by the label of each selected row.
        groups.append((np.c_[X[rows, :], y[rows]], colour, symbol))

    plt.figure()
    for points, colour, symbol in groups:
        plt.scatter(x=points[:, 0], y=points[:, 1], color=colour, marker=symbol)
    plt.show()

Output for dataset1:
ex6data1

1.3 take the SVM algorithm

def training_data(X1, y1, C, kernel, tol, max_iter, gamma):
    """Fit an ``svm.SVC`` classifier and report its accuracy on the training data.

    Args:
        X1: Training features passed to ``fit`` and ``score``.
        y1: Training labels passed to ``fit`` and ``score``.
        C: Regularization parameter forwarded to ``svm.SVC``.
        kernel: Either ``'linear'`` or ``'rbf'``.
        tol: Stopping tolerance forwarded to ``svm.SVC``.
        max_iter: Iteration limit forwarded to ``svm.SVC``.
        gamma: Kernel coefficient; only forwarded when ``kernel == 'rbf'``
            and ignored for the linear kernel.

    Returns:
        The fitted ``svm.SVC`` model.

    Raises:
        ValueError: If ``kernel`` is neither ``'linear'`` nor ``'rbf'``.
            Previously this case failed later with an unbound ``model``
            variable instead of a clear message.
    """
    if kernel not in ('linear', 'rbf'):
        raise ValueError(
            "unsupported kernel %r: expected 'linear' or 'rbf'" % (kernel,)
        )

    params = {'C': C, 'kernel': kernel, 'tol': tol, 'max_iter': max_iter}
    if kernel == 'rbf':
        # gamma only has a meaning for the Gaussian kernel.
        params['gamma'] = gamma
    model = svm.SVC(**params)

    estimator = model.fit(X1, y1)
    accuracy = model.score(X1, y1)
    print('an instance of estimator is ', estimator)
    print('the mean accuracy is ', accuracy)
    return model

1.4 visualize the dataset---without kernel (linear kernel)

def find_decision_boundary_linear(model, X, y):
    """Plot a two-class dataset together with a linear model's decision boundary.

    The boundary is the line ``w[0] * x1 + w[1] * x2 + b = 0`` built from
    ``model.coef_`` and ``model.intercept_``.  Plain ndarrays are used instead
    of ``np.matrix``, which NumPy no longer recommends.

    Args:
        model: Fitted linear model exposing ``coef_`` and ``intercept_``
            for two features.
        X: Feature array; its first two columns are the plot coordinates.
        y: Label array aligned with the rows of ``X`` (values 1 and 0).

    Returns:
        None.  A new figure is opened and displayed with ``plt.show()``.
    """
    # original examples, split by label
    positive = X[np.argwhere(y == 1)[:, 0], :]
    negative = X[np.argwhere(y == 0)[:, 0], :]

    w = np.asarray(model.coef_, dtype=float).ravel()
    b = float(np.ravel(model.intercept_)[0])
    # Horizontal extent of the boundary line, fixed by the original author
    # (presumably chosen to fit the first exercise dataset).
    x_min = 0
    x_max = 4

    # plot
    plt.figure()
    plt.scatter(x=positive[:, 0], y=positive[:, 1], color='k', marker='+')
    plt.scatter(x=negative[:, 0], y=negative[:, 1], color='y', marker='o')
    if w[1] != 0:
        x_plot = np.linspace(start=x_min, stop=x_max, num=100)
        y_plot = -(w[0] * x_plot + b) / w[1]
        plt.plot(x_plot, y_plot, color='b')
    elif w[0] != 0:
        # Vertical boundary: solving for x2 would divide by zero, so draw the
        # line x1 = -b / w[0] across the vertical range of the data instead.
        x_boundary = -b / w[0]
        plt.plot([x_boundary, x_boundary], [X[:, 1].min(), X[:, 1].max()], color='b')
    # If both weights are zero there is no boundary line to draw.
    plt.show()

Output for dataset1 with C=1:
ex6data1_fitting_C1
Output for dataset1 with C=100:
ex6data1_fitting_C100

1.5 main function for dataset1

'''SVM without kernel(linear kernel)'''
# Driver for dataset 1: load the data, plot it, fit a linear-kernel SVM and
# plot the resulting decision boundary.
# read the dataset
path1 = 'D:/新建文件夹/机器学习/Machine_Learning_exercise/exercise_6/ex6/ex6data1.mat'
X1, y1 = read_data(path1)
# visualize the dataset
visiualize_data(X1, y1)
# train the SVM
C1 = 100
kernel1 = 'linear'
tol1 = 1e-3
max_iter1 = 100
# training_data only forwards gamma for the 'rbf' kernel, so this value is
# a placeholder for the linear case.
gamma1 = 0
model1 = training_data(X1, y1, C1, kernel1, tol1, max_iter1, gamma1)
# visualize the classifier
find_decision_boundary_linear(model1, X1, y1)

1.6 main function for dataset2

'''SVM with Gaussian kernel'''
# Driver for dataset 2: load the data, plot it, fit an RBF-kernel SVM and
# plot the resulting non-linear decision boundary.
path2 = 'D:/新建文件夹/机器学习/Machine_Learning_exercise/exercise_6/ex6/ex6data2.mat'
X2, y2 = read_data(path2)
# visualize the dataset
visiualize_data(X2, y2)
# train the SVM
C2 = 100
kernel2 = 'rbf'
tol2 = 1e-3
max_iter2 = 1000
sigma2 = 0.1 # bandwidth sigma of the Gaussian kernel function
gamma2 = 1 / (2 * np.power(sigma2, 2))   # equivalent gamma = 1 / (2 * sigma^2) expected by the svm functions
model2 = training_data(X2, y2, C2, kernel2, tol2, max_iter2, gamma2)
# visualize the classifier
find_decision_boundary_gaussian(model2, X2, y2)

dataset2:
ex6data2

1.7 visualize the dataset---Gaussian kernel (rbf kernel)

def find_decision_boundary_gaussian(model, X, y):
    """Plot a two-class dataset together with a model's non-linear decision boundary.

    The model is evaluated on a 100 x 100 grid spanning the range of the first
    two feature columns, and the boundary is drawn as a contour of the
    predicted labels.

    The grid is built from plain ndarrays: recent scikit-learn releases reject
    ``np.matrix`` inputs to ``predict`` with a ``TypeError``.

    Args:
        model: Fitted classifier exposing ``predict`` for two-feature inputs.
        X: Feature array; its first two columns are the plot coordinates.
        y: Label array aligned with the rows of ``X`` (values 1 and 0).

    Returns:
        None.  A new figure is opened and displayed with ``plt.show()``.
    """
    # original examples, split by label
    positive = X[np.argwhere(y == 1)[:, 0], :]
    negative = X[np.argwhere(y == 0)[:, 0], :]

    # evaluation grid covering the observed range of both features
    x1_axis = np.linspace(start=X[:, 0].min(), stop=X[:, 0].max(), num=100)
    x2_axis = np.linspace(start=X[:, 1].min(), stop=X[:, 1].max(), num=100)
    x1, x2 = np.meshgrid(x1_axis, x2_axis)
    grid_points = np.c_[x1.ravel(), x2.ravel()]

    # One predicted label per grid point, folded back into the grid's shape.
    values_plot = np.asarray(model.predict(grid_points)).reshape(x1.shape)

    # plot
    plt.figure()
    plt.scatter(x=positive[:, 0], y=positive[:, 1], color='k', marker='+')
    plt.scatter(x=negative[:, 0], y=negative[:, 1], color='y', marker='o')
    plt.contour(x1, x2, values_plot, [0])
    plt.show()

Output for dataset2:
ex6data2_fitting

1.8 main function for dataset3

'''SVM with Gaussian kernel'''
    # Driver for dataset 3: load the training and validation splits, pick C and
    # sigma with find_best_params, then fit and plot an RBF-kernel SVM.
    # read the dataset
    path3 = 'D:/新建文件夹/机器学习/Machine_Learning_exercise/exercise_6/ex6/ex6data3.mat'
    data3 = loadmat(path3)
    X3 = data3['X']
    y3 = data3['y']
    Xval3 = data3['Xval']
    yval3 = data3['yval']
    # visualize the dataset
    visiualize_data(X3, y3)
    # determine the parameters C and sigma
    # NOTE(review): only the validation split is handed to find_best_params,
    # which fits and scores each candidate on that same data; the training
    # split (X3, y3) takes no part in the search.
    C3, sigma3, best_score = find_best_params(Xval3, yval3)
    print('C_best = ', C3)
    print('sigma_best = ', sigma3)
    # train the SVM with the selected parameters
    kernel3 = 'rbf'
    tol3 = 1e-3
    max_iter3 = 1000
    # convert the Gaussian bandwidth sigma into gamma = 1 / (2 * sigma^2)
    gamma3 = 1 / (2 * np.power(sigma3, 2))
    model3 = training_data(X3, y3, C3, kernel3, tol3, max_iter3, gamma3)
    # visualize the classifier
    find_decision_boundary_gaussian(model3, X3, y3)

dataset3:
ex6data3

1.9 find the best parameters for svm

def find_best_params(Xval, yval, X=None, y=None):
    """Grid-search ``C`` and ``sigma`` for an RBF-kernel ``svm.SVC``.

    Every combination of the candidate values 0.1, 0.3, 1, 3, 10 and 30 is
    tried for both parameters.  ``sigma`` is converted to the ``gamma``
    expected by ``svm.SVC`` via ``gamma = 1 / (2 * sigma**2)``.

    Args:
        Xval: Validation features used to score each candidate model.
        yval: Validation labels used to score each candidate model.
        X: Optional training features.  When given (together with ``y``),
            candidates are fitted on the training data and scored on the
            validation data, which is the usual model-selection setup.
        y: Optional training labels; must be supplied together with ``X``.

    Returns:
        A ``(best_C, best_sigma, best_score)`` tuple for the first combination
        that reaches the highest validation accuracy (``C`` varies in the
        outer loop, ``sigma`` in the inner loop).

    Raises:
        ValueError: If only one of ``X`` and ``y`` is provided.

    Note:
        For backward compatibility, when ``X`` and ``y`` are omitted the
        candidates are both fitted and scored on the validation data, which
        favours parameters that simply memorise that data.
    """
    if (X is None) != (y is None):
        raise ValueError('X and y must be provided together')

    if X is None:
        X_fit, y_fit = Xval, yval
    else:
        X_fit, y_fit = X, y
    # Flatten (m, 1) label columns to the 1-D shape scikit-learn expects.
    y_fit = np.ravel(y_fit)
    yval_flat = np.ravel(yval)

    candidates = (0.1, 0.3, 1.0, 3.0, 10.0, 30.0)
    best_score = 0
    best_C = 0
    best_sigma = 0
    for C in candidates:
        for sigma in candidates:
            gamma = 1 / (2 * sigma ** 2)
            model_temp = svm.SVC(C=C, kernel='rbf', gamma=gamma, tol=1e-3)
            model_temp.fit(X_fit, y_fit)
            accuracy_temp = model_temp.score(Xval, yval_flat)
            # Strict comparison keeps the earliest combination on ties.
            if accuracy_temp > best_score:
                best_score = accuracy_temp
                best_C = C
                best_sigma = sigma
    return best_C, best_sigma, best_score

Output for dataset3:
ex6data3_fitting

2. Spam Classification

2.1 training the svm algorithm

def training_data(X, y):
    """Fit a linear-kernel ``svm.SVC`` with ``C=100`` and report its accuracy.

    Args:
        X: Training features passed to ``fit`` and ``score``.
        y: Training labels passed to ``fit`` and ``score``.

    Returns:
        The fitted ``svm.SVC`` model.  The fitted estimator and its mean
        accuracy on the same ``X``/``y`` are printed as a side effect.
    """
    classifier = svm.SVC(kernel='linear', C=100)
    fitted = classifier.fit(X, y)
    # Accuracy is measured on the very data the model was trained on.
    train_accuracy = classifier.score(X, y)
    print('an instance of estimator is ', fitted)
    print('the mean accuracy is ', train_accuracy)
    return classifier

2.2 main function

if __name__ == '__main__':
    # Spam classifier driver: train a linear SVM on the spam training set,
    # report its accuracy on the test set, then list the vocabulary words
    # with the largest positive weights.
    # read the dataset
    path1 = 'D:/新建文件夹/机器学习/Machine_Learning_exercise/exercise_6/ex6/spamTrain.mat'
    data1 = loadmat(path1)
    X = data1['X']
    y = data1['y']
    # train svm algorithm
    model = training_data(X, y)
    # compute the mean accuracy of the test dataset
    path2 = 'D:/新建文件夹/机器学习/Machine_Learning_exercise/exercise_6/ex6/spamTest.mat'
    data2 = loadmat(path2)
    Xtest = data2['Xtest']
    ytest = data2['ytest']
    test_accuracy = model.score(Xtest, ytest)
    print('the mean accuracy of the test dataset is ', test_accuracy)
    # see which words the classifier thinks are the most predictive of spam
    # find the index and the order of the words
    w = np.matrix(model.coef_).T
    # Sorting -w ascending orders the weights from largest to smallest; negate
    # the result back so w_sort holds the real weights rather than their
    # negatives.
    w_sort = -np.sort(-w, axis=0)    # sort from largest to smallest
    w_index = np.argsort(-w, axis=0)    # zero-based positions in the same order
    # pair each weight with a 1-based word index (presumably matching the
    # numbering in vocab.txt)
    w_new = np.c_[w_index + 1, w_sort]
    # print(w_new[:15, :])
    # get vocabulary list
    path_volcabulary = 'D:/新建文件夹/机器学习/Machine_Learning_exercise/exercise_6/ex6/vocab.txt'
    data_volcabulary = pd.read_csv(path_volcabulary, header=None, names=['index', 'volcabulary'], sep='\t')
    # print(data_volcabulary.head())
    index = np.matrix(data_volcabulary.loc[:, ['index']].values)
    volcabulary = np.matrix(data_volcabulary.loc[:, ['volcabulary']].values)
    # find the spam words: the 15 vocabulary rows with the largest weights
    spam_words = volcabulary[w_index[0:15, 0], 0]
    print(spam_words)

Output---the spam words:

[['emailaddr']
['flash']
['tm']
['visit']
['our']
['remov']
['click']
['basenumb']
['will']
['numberb']
['wi']
['price']
['hear']
['tel']
['guarante']]