6. Support Vector Machine - ZYL-Harry/Machine_Learning_study GitHub Wiki

Basic Introduction

  • Objective: find a hyperplane in an N-dimensional space that distinctly classifies the data points
  • Classification: hard margin SVM, soft margin SVM, kernel SVM

Difference

image

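  • different cost function: the regularization weight sits in a different place (logistic regression multiplies the regularization term by λ, while the SVM multiplies the data-fit term by C, which plays the role of 1/λ)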
  • different hypothesis

Large Margin

image
More detailed mathematical process can be seen in this link: Mathematical process of the basic Support Vector Machine

Kernel

  • Part of the objective: define extra features using landmarks (initial features) and kernels (similarity functions) to learn more complex nonlinear classifiers image

Details:
image
image
image

  • Other kernels:
    1.polynomial kernel
    2.string kernel
    3.chi-square kernel
    4.histogram intersection kernel
    ......

Tip:
Kernels can also be applied to linear regression and logistic regression, but the computation is very slow there, whereas the SVM runs efficiently with kernels.

SVM parameter

  • C
    1.large C---means: small λ---low bias and high variance
    2.small C---means: large λ---high bias and low variance
  • σ^2
    1.large σ^2---means: features vary more smoothly---high bias and low variance
    2.small σ^2---means: features vary less smoothly---low bias and high variance
  • γ (the form usually used in code; for the Gaussian kernel γ = 1 / (2σ^2))
    image

Multi classification

image

Advantage in application

image

Exercise by python

  • The basic functions of SVM in python sklearn.svm.SVC() can be seen in this link: sklearn.svm.SVC

1. Support Vector Machines

1.1 read the dataset

def read_data(path):
    """Load a MATLAB ``.mat`` file and return its features and labels.

    Parameters
    ----------
    path : str
        Location of the ``.mat`` file. The file must contain the
        variables ``'X'`` and ``'y'``.

    Returns
    -------
    tuple
        ``(X, y)`` exactly as stored under the ``'X'`` and ``'y'`` keys.
    """
    contents = loadmat(path)
    # For dataset 1 the original notes record shapes (51, 2) and (51, 1).
    return contents['X'], contents['y']

1.2 visualize the dataset

def visiualize_data(X, y):
    """Scatter-plot the examples, drawing each class with its own marker.

    Parameters
    ----------
    X : array
        Feature matrix; columns 0 and 1 are used as plot coordinates.
    y : array
        Labels; rows equal to 1 are drawn as black '+', rows equal to 0
        as yellow 'o'.
    """
    # Row indices of each class, computed once and reused for X and y.
    pos_rows = np.argwhere(y == 1)[:, 0]
    neg_rows = np.argwhere(y == 0)[:, 0]
    positive = np.c_[X[pos_rows, :], y[pos_rows]]
    negative = np.c_[X[neg_rows, :], y[neg_rows]]

    plt.figure()
    for group, colour, symbol in ((positive, 'k', '+'), (negative, 'y', 'o')):
        plt.scatter(x=group[:, 0], y=group[:, 1], color=colour, marker=symbol)
    plt.show()

Output for dataset1:
ex6data1

1.3 take the SVM algorithm

def training_data(X1, y1, C, kernel, tol, max_iter, gamma):
    """Fit an ``svm.SVC`` classifier and print its training accuracy.

    Parameters
    ----------
    X1, y1 : array
        Training features and labels.
    C : float
        Regularization parameter passed to ``svm.SVC``.
    kernel : str
        Either ``'linear'`` or ``'rbf'``.
    tol : float
        Stopping tolerance passed to ``svm.SVC``.
    max_iter : int
        Iteration limit passed to ``svm.SVC``.
    gamma : float
        Kernel coefficient; only forwarded when ``kernel == 'rbf'``.

    Returns
    -------
    The fitted ``svm.SVC`` model.

    Raises
    ------
    ValueError
        If ``kernel`` is neither ``'linear'`` nor ``'rbf'``. Without this
        check ``model`` would be unbound and the function would fail with
        a confusing ``UnboundLocalError``.
    """
    if kernel == 'linear':
        # gamma is deliberately not forwarded for the linear kernel
        model = svm.SVC(C=C, kernel=kernel, tol=tol, max_iter=max_iter)
    elif kernel == 'rbf':
        model = svm.SVC(C=C, kernel=kernel, gamma=gamma, tol=tol, max_iter=max_iter)
    else:
        raise ValueError(
            "unsupported kernel %r: expected 'linear' or 'rbf'" % (kernel,)
        )
    estimator = model.fit(X1, y1)
    accuracy = model.score(X1, y1)
    print('an instance of estimator is ', estimator)
    print('the mean accuracy is ', accuracy)
    return model

1.4 visualize the dataset---without kernel (linear kernel)

def find_decision_boundary_linear(model, X, y, x_min=0, x_max=4):
    """Plot the examples together with a linear model's decision boundary.

    The boundary is the line ``w0 * x + w1 * y + b = 0`` built from
    ``model.coef_`` and ``model.intercept_``.

    Parameters
    ----------
    model : fitted linear classifier
        Must expose ``coef_`` (first row holds ``w0, w1``) and
        ``intercept_``.
    X : array
        Feature matrix; columns 0 and 1 are used as plot coordinates.
    y : array
        Labels; rows equal to 1 are drawn as black '+', rows equal to 0
        as yellow 'o'.
    x_min, x_max : float, optional
        Horizontal extent of the plotted boundary. The defaults (0 and 4)
        are the values that used to be hard-coded for dataset 1.
    """
    # original examples, split by class
    pos_rows = np.argwhere(y == 1)[:, 0]
    neg_rows = np.argwhere(y == 0)[:, 0]
    positive = np.c_[X[pos_rows, :], y[pos_rows]]
    negative = np.c_[X[neg_rows, :], y[neg_rows]]
    # create the data for plotting (plain arrays instead of np.matrix)
    w = np.asarray(model.coef_)
    b = np.ravel(model.intercept_)[0]
    x_plot = np.linspace(start=x_min, stop=x_max, num=100)
    # solve w0 * x + w1 * y + b = 0 for y; assumes w[0, 1] is non-zero
    y_plot = -(w[0, 0] * x_plot + b) / w[0, 1]
    # plot
    plt.figure()
    plt.scatter(x=positive[:, 0], y=positive[:, 1], color='k', marker='+')
    plt.scatter(x=negative[:, 0], y=negative[:, 1], color='y', marker='o')
    plt.plot(x_plot, y_plot, color='b')
    plt.show()

Output for dataset1 with C=1:
ex6data1_fitting_C1
Output for dataset1 with C=100:
ex6data1_fitting_C100

1.5 main function for dataset1

'''SVM without kernel(linear kernel)'''
# Script for dataset 1: load the data, plot it, fit a linear-kernel SVM and
# draw the resulting decision boundary.
# read the dataset
path1 = 'D:/新建文件夹/机器学习/Machine_Learning_exercise/exercise_6/ex6/ex6data1.mat'
X1, y1 = read_data(path1)
# visualize the dataset
visiualize_data(X1, y1)
# hyper-parameters handed to training_data
C1 = 100
kernel1 = 'linear'
tol1 = 1e-3
max_iter1 = 100
gamma1 = 0  # placeholder: training_data does not forward gamma for the 'linear' kernel
model1 = training_data(X1, y1, C1, kernel1, tol1, max_iter1, gamma1)
# visualize the classifier
find_decision_boundary_linear(model1, X1, y1)

1.6 main function for dataset2

'''SVM with Gaussian kernel'''
# Script for dataset 2: load the data, plot it, fit an RBF-kernel SVM and
# draw the resulting decision boundary.
path2 = 'D:/新建文件夹/机器学习/Machine_Learning_exercise/exercise_6/ex6/ex6data2.mat'
X2, y2 = read_data(path2)
# visualize the dataset
visiualize_data(X2, y2)
# hyper-parameters handed to training_data
C2 = 100
kernel2 = 'rbf'
tol2 = 1e-3
max_iter2 = 1000
sigma2 = 0.1 # the parameter in the gaussian kernel function
# convert sigma to the gamma expected by the svm functions: gamma = 1 / (2 * sigma^2)
gamma2 = 1 / (2 * np.power(sigma2, 2))   # the parameter used in the svm functions when using the gaussian kernel
model2 = training_data(X2, y2, C2, kernel2, tol2, max_iter2, gamma2)
# visualize the classifier
find_decision_boundary_gaussian(model2, X2, y2)

dataset2:
ex6data2

1.7 visualize the dataset---Gaussian kernel (rbf kernel)

def find_decision_boundary_gaussian(model, X, y):
    """Plot the examples together with a non-linear decision boundary.

    The model is evaluated on a 100 x 100 grid spanning the range of the
    first two feature columns, and the contour of the predictions is drawn
    over the scatter plot.

    Parameters
    ----------
    model : fitted classifier
        Must expose ``predict``.
    X : array
        Feature matrix; columns 0 and 1 are used as plot coordinates.
    y : array
        Labels; rows equal to 1 are drawn as black '+', rows equal to 0
        as yellow 'o'.
    """
    # original examples, split by class
    pos_rows = np.argwhere(y == 1)[:, 0]
    neg_rows = np.argwhere(y == 0)[:, 0]
    positive = np.c_[X[pos_rows, :], y[pos_rows]]
    negative = np.c_[X[neg_rows, :], y[neg_rows]]
    # create the data for plotting
    x1_plot = np.linspace(start=X[:, 0].min(), stop=X[:, 0].max(), num=100)
    x2_plot = np.linspace(start=X[:, 1].min(), stop=X[:, 1].max(), num=100)
    # 'ij' indexing yields the same grids as transposing the default meshgrid
    x1, x2 = np.meshgrid(x1_plot, x2_plot, indexing='ij')
    # Plain ndarray of (x1, x2) pairs: recent scikit-learn releases reject
    # np.matrix input, and the old local name shadowed the builtin input().
    grid_points = np.column_stack((x1.ravel(), x2.ravel()))
    values_plot = np.asarray(model.predict(grid_points)).reshape(x1.shape)
    # plot
    plt.figure()
    plt.scatter(x=positive[:, 0], y=positive[:, 1], color='k', marker='+')
    plt.scatter(x=negative[:, 0], y=negative[:, 1], color='y', marker='o')
    plt.contour(x1, x2, values_plot, [0])
    plt.show()

Output for dataset2:
ex6data2_fitting

1.8 main function for dataset3

'''SVM with Gaussian kernel'''
    # Script for dataset 3: load the training and validation splits, pick C and
    # sigma with find_best_params, then fit and plot an RBF-kernel SVM.
    # read the dataset
    path3 = 'D:/新建文件夹/机器学习/Machine_Learning_exercise/exercise_6/ex6/ex6data3.mat'
    data3 = loadmat(path3)
    X3 = data3['X']
    y3 = data3['y']
    Xval3 = data3['Xval']
    yval3 = data3['yval']
    # visualize the dataset
    visiualize_data(X3, y3)
    # determine the parameters C and sigma
    # NOTE(review): only the validation split is passed here, so by default each
    # candidate is both fitted and scored on that same data - confirm this is intended
    C3, sigma3, best_score = find_best_params(Xval3, yval3)
    print('C_best = ', C3)
    print('sigma_best = ', sigma3)
    # take the svm algorithm
    kernel3 = 'rbf'
    tol3 = 1e-3
    max_iter3 = 1000
    # convert sigma to the gamma expected by the svm functions: gamma = 1 / (2 * sigma^2)
    gamma3 = 1 / (2 * np.power(sigma3, 2))
    model3 = training_data(X3, y3, C3, kernel3, tol3, max_iter3, gamma3)
    # visualize the classifier
    find_decision_boundary_gaussian(model3, X3, y3)

dataset3:
ex6data3

1.9 find the best parameters for svm

def find_best_params(Xval, yval, X=None, y=None):
    """Grid-search ``C`` and ``sigma`` for an RBF-kernel ``svm.SVC``.

    Every combination of the candidate values 0.1, 0.3, 1, 3, 10 and 30 is
    tried for ``C`` and ``sigma``; the pair with the highest accuracy on
    ``(Xval, yval)`` wins (the first one found on ties).

    Parameters
    ----------
    Xval, yval : array
        Validation features and labels used to score each candidate.
    X, y : array, optional
        Training features and labels used to fit each candidate. When both
        are omitted the candidates are fitted on ``(Xval, yval)`` itself,
        which is the historical behaviour; pass the training split so that
        models are not scored on the data they were fitted on.

    Returns
    -------
    tuple
        ``(best_C, best_sigma, best_score)``.

    Raises
    ------
    ValueError
        If only one of ``X`` and ``y`` is supplied.
    """
    if (X is None) != (y is None):
        raise ValueError('X and y must be supplied together')
    if X is None:
        X_fit, y_fit = Xval, yval
    else:
        X_fit, y_fit = X, y

    candidates = np.array([0.1, 0.3, 1, 3, 10, 30])
    best_score = 0
    best_C = 0
    best_sigma = 0
    for C in candidates:
        for sigma in candidates:
            # sklearn parameterizes the Gaussian kernel by gamma = 1 / (2 * sigma^2)
            gamma = 1 / (2 * np.power(sigma, 2))
            model_temp = svm.SVC(C=C, kernel='rbf', gamma=gamma, tol=1e-3)
            model_temp.fit(X_fit, y_fit)
            accuracy_temp = model_temp.score(Xval, yval)
            # strict '>' keeps the first combination that reaches the best score
            if accuracy_temp > best_score:
                best_score = accuracy_temp
                best_C = C
                best_sigma = sigma
    return best_C, best_sigma, best_score

Output for dataset3:
ex6data3_fitting

2. Spam Classification

2.1 training the svm algorithm

def training_data(X, y):
    """Fit a linear-kernel ``svm.SVC`` with ``C=100`` and print its training accuracy.

    Parameters
    ----------
    X, y : array
        Training features and labels.

    Returns
    -------
    The fitted ``svm.SVC`` model.
    """
    classifier = svm.SVC(kernel='linear', C=100)
    fitted = classifier.fit(X, y)
    train_accuracy = classifier.score(X, y)
    print('an instance of estimator is ', fitted)
    print('the mean accuracy is ', train_accuracy)
    return classifier

2.2 main function

if __name__ == '__main__':
    # Spam classification script: train a linear SVM on spamTrain.mat, report
    # the accuracy on spamTest.mat, then list the 15 vocabulary words with the
    # largest weights.
    # read the dataset
    path1 = 'D:/新建文件夹/机器学习/Machine_Learning_exercise/exercise_6/ex6/spamTrain.mat'
    data1 = loadmat(path1)
    X = data1['X']
    y = data1['y']
    # train svm algorithm
    model = training_data(X, y)
    # compute the mean accuracy of the test dataset
    path2 = 'D:/新建文件夹/机器学习/Machine_Learning_exercise/exercise_6/ex6/spamTest.mat'
    data2 = loadmat(path2)
    Xtest = data2['Xtest']
    ytest = data2['ytest']
    test_accuracy = model.score(Xtest, ytest)
    print('the mean accuracy of the test dataset is ', test_accuracy)
    # see which words the classifier thinks are the most predictive of spam
    # find the index and the order of the words
    w = np.matrix(model.coef_).T
    # Sorting -w ascending orders the weights from largest to smallest; negate
    # the result again so w_sort holds the real weights rather than their negatives.
    w_sort = -np.sort(-w, axis=0)    # sort from largest to smallest
    w_index = np.argsort(-w, axis=0)
    # pair each 1-based word index with its weight
    w_new = np.c_[w_index + 1, w_sort]
    # get vocabulary list (tab-separated file: index, word)
    path_volcabulary = 'D:/新建文件夹/机器学习/Machine_Learning_exercise/exercise_6/ex6/vocab.txt'
    data_volcabulary = pd.read_csv(path_volcabulary, header=None, names=['index', 'volcabulary'], sep='\t')
    index = np.matrix(data_volcabulary.loc[:, ['index']].values)
    volcabulary = np.matrix(data_volcabulary.loc[:, ['volcabulary']].values)
    # find the spam words: w_index is 0-based, so it indexes the vocabulary rows directly
    spam_words = volcabulary[w_index[0:15, 0], 0]
    print(spam_words)

Output---the spam words:

[['emailaddr']
['flash']
['tm']
['visit']
['our']
['remov']
['click']
['basenumb']
['will']
['numberb']
['wi']
['price']
['hear']
['tel']
['guarante']]