6. Support Vector Machine - ZYL-Harry/Machine_Learning_study GitHub Wiki
Basic Introduction
- Objective: find a hyperplane in an N-dimensional space that distinctly classifies the data points
- Classification: hard margin SVM, soft margin SVM, kernel SVM
Difference
- different cost function: the regularization parameter sits in a different place (C multiplies the data-fit term instead of λ multiplying the regularization term, so C behaves like 1/λ)
- different hypothesis
Large Margin
More detailed mathematical process can be seen in this link: Mathematical process of the basic Support Vector Machine
Kernel
- Part of the objective: define extra features using landmarks(initial features) and kernels(like similarity functions) to learn more complex nonlinear classifiers
Details:
- Other kernels:
1.polynomial kernel
2.string kernel
3.chi-square kernel
4.histogram intersection kernel
......
Tip:
Kernels can also be applied to linear regression and logistic regression, but the computation will be very slow; the SVM, by contrast, runs quite well with kernels.
SVM parameter
- C
1.large C---means: small λ---low bias and high variance
2.small C---means: large λ---high bias and low variance
- σ^2
1.large σ^2---means: features vary more smoothly---high bias and low variance
2.small σ^2---means: features vary less smoothly---low bias and high variance
- γ (always used in code): γ = 1 / (2σ^2), the parameter passed to the SVM functions when using the Gaussian kernel
Multi classification
Advantage in application
Exercise by python
- The basic functions of SVM in python
sklearn.svm.SVC()
can be seen in this link: sklearn.svm.SVC
1. Support Vector Machines
1.1 read the dataset
def read_data(path):
    """Load a MATLAB ``.mat`` file and return its ``X`` and ``y`` entries.

    Args:
        path: location of the ``.mat`` file, handed straight to ``loadmat``.

    Returns:
        Tuple ``(X, y)`` read from the ``'X'`` and ``'y'`` keys of the file.
    """
    contents = loadmat(path)
    # For ex6data1 the author recorded shapes (51, 2) for X and (51, 1) for y.
    return contents['X'], contents['y']
1.2 visualize the dataset
def visiualize_data(X, y):
    """Scatter-plot the examples, one marker style per class.

    Rows whose label equals 1 are drawn as black '+' markers and rows whose
    label equals 0 as yellow 'o' markers, using the first two columns of
    ``X`` as plot coordinates. The figure is shown immediately.

    Args:
        X: feature array; columns 0 and 1 are plotted.
        y: label array compared against 1 and 0.
    """
    pos_rows = np.argwhere(y == 1)[:, 0]
    neg_rows = np.argwhere(y == 0)[:, 0]
    # Each group keeps its labels as a trailing column next to the features.
    positive = np.c_[X[pos_rows, :], y[pos_rows]]
    negative = np.c_[X[neg_rows, :], y[neg_rows]]

    plt.figure()
    for group, colour, symbol in ((positive, 'k', '+'), (negative, 'y', 'o')):
        plt.scatter(x=group[:, 0], y=group[:, 1], color=colour, marker=symbol)
    plt.show()
Output for dataset1:
1.3 take the SVM algorithm
def training_data(X1, y1, C, kernel, tol, max_iter, gamma):
    """Fit an ``svm.SVC`` classifier and report its training accuracy.

    Args:
        X1: training features.
        y1: training labels.
        C: regularization parameter forwarded to ``svm.SVC``.
        kernel: ``'linear'`` or ``'rbf'``.
        tol: stopping tolerance forwarded to ``svm.SVC``.
        max_iter: iteration cap forwarded to ``svm.SVC``.
        gamma: kernel coefficient; only forwarded when ``kernel == 'rbf'``.

    Returns:
        The fitted ``svm.SVC`` model.

    Raises:
        ValueError: if ``kernel`` is neither ``'linear'`` nor ``'rbf'``.
    """
    # Validate first: previously an unknown kernel left `model` unassigned and
    # the function failed later with an unrelated unbound-variable error.
    if kernel not in ('linear', 'rbf'):
        raise ValueError(
            "unsupported kernel %r: expected 'linear' or 'rbf'" % (kernel,)
        )

    if kernel == 'linear':
        model = svm.SVC(C=C, kernel=kernel, tol=tol, max_iter=max_iter)
    else:
        model = svm.SVC(C=C, kernel=kernel, gamma=gamma, tol=tol, max_iter=max_iter)

    estimator = model.fit(X1, y1)
    accuracy = model.score(X1, y1)  # accuracy on the data the model was fit on
    print('an instance of estimator is ', estimator)
    print('the mean accuracy is ', accuracy)
    return model
1.4 visualize the decision boundary---without kernel (linear kernel)
def find_decision_boundary_linear(model, X, y):
    """Plot the examples together with the decision line of a linear model.

    The line drawn is the set of points where
    ``w1 * x1 + w2 * x2 + b = 0``, with ``w`` read from ``model.coef_`` and
    ``b`` from ``model.intercept_``.

    Args:
        model: fitted linear classifier exposing ``coef_`` and ``intercept_``.
        X: feature array; columns 0 and 1 are plotted.
        y: label array compared against 1 and 0.
    """
    # Split the original examples by class.
    pos_rows = np.argwhere(y == 1)[:, 0]
    neg_rows = np.argwhere(y == 0)[:, 0]
    positive = np.c_[X[pos_rows, :], y[pos_rows]]
    negative = np.c_[X[neg_rows, :], y[neg_rows]]

    # Plain ndarrays instead of np.matrix, which NumPy no longer recommends.
    weights = np.asarray(model.coef_, dtype=float).reshape(-1)
    bias = float(np.asarray(model.intercept_, dtype=float).reshape(-1)[0])
    w1, w2 = weights[0], weights[1]

    # Horizontal extent of the plotted line (fixed range kept from the
    # original implementation).
    x_min = 0
    x_max = 4

    if w2 != 0:
        x_plot = np.linspace(start=x_min, stop=x_max, num=100)
        y_plot = -(w1 * x_plot + bias) / w2
    elif w1 != 0:
        # Vertical boundary: solving for x2 would divide by zero, so draw the
        # line x1 = -b / w1 across the vertical span of the data instead.
        x_plot = np.full(2, -bias / w1)
        y_plot = np.array([X[:, 1].min(), X[:, 1].max()])
    else:
        # Degenerate model with no usable weights: nothing to draw.
        x_plot = None
        y_plot = None

    plt.figure()
    plt.scatter(x=positive[:, 0], y=positive[:, 1], color='k', marker='+')
    plt.scatter(x=negative[:, 0], y=negative[:, 1], color='y', marker='o')
    if x_plot is not None:
        plt.plot(x_plot, y_plot, color='b')
    plt.show()
Output for dataset1 with C=1:
Output for dataset1 with C=100:
1.5 main function for dataset1
'''SVM without kernel(linear kernel)'''
# Settings for the linear-kernel run. gamma1 is passed along only to satisfy
# the training_data signature; the linear branch does not forward it.
C1, kernel1, tol1, max_iter1, gamma1 = 100, 'linear', 1e-3, 100, 0

# Load dataset 1 and look at the raw points first.
path1 = 'D:/新建文件夹/机器学习/Machine_Learning_exercise/exercise_6/ex6/ex6data1.mat'
X1, y1 = read_data(path1)
visiualize_data(X1, y1)

# Fit the classifier, then draw its decision line over the examples.
model1 = training_data(X1, y1, C1, kernel1, tol1, max_iter1, gamma1)
find_decision_boundary_linear(model1, X1, y1)
1.6 main function for dataset2
'''SVM with Gaussian kernel'''
# Settings for the rbf-kernel run.
C2, kernel2, tol2, max_iter2 = 100, 'rbf', 1e-3, 1000
sigma2 = 0.1  # width parameter of the Gaussian kernel function
# The SVM functions take gamma rather than sigma: gamma = 1 / (2 * sigma^2).
gamma2 = 1 / (np.power(sigma2, 2) * 2)

# Load dataset 2 and look at the raw points first.
path2 = 'D:/新建文件夹/机器学习/Machine_Learning_exercise/exercise_6/ex6/ex6data2.mat'
X2, y2 = read_data(path2)
visiualize_data(X2, y2)

# Fit the classifier, then draw its decision boundary over the examples.
model2 = training_data(X2, y2, C2, kernel2, tol2, max_iter2, gamma2)
find_decision_boundary_gaussian(model2, X2, y2)
dataset2:
1.7 visualize the decision boundary---Gaussian kernel (rbf kernel)
def find_decision_boundary_gaussian(model, X, y):
    """Plot the examples together with a non-linear decision boundary.

    The model is evaluated on a 100 x 100 grid spanning the range of the
    first two feature columns, and the boundary is drawn as the contour
    separating predicted label 0 from predicted label 1.

    Args:
        model: fitted classifier exposing ``predict``.
        X: feature array; columns 0 and 1 are plotted.
        y: label array compared against 1 and 0.
    """
    # Split the original examples by class.
    pos_rows = np.argwhere(y == 1)[:, 0]
    neg_rows = np.argwhere(y == 0)[:, 0]
    positive = np.c_[X[pos_rows, :], y[pos_rows]]
    negative = np.c_[X[neg_rows, :], y[neg_rows]]

    # Build the evaluation grid over the data range.
    x1_axis = np.linspace(start=X[:, 0].min(), stop=X[:, 0].max(), num=100)
    x2_axis = np.linspace(start=X[:, 1].min(), stop=X[:, 1].max(), num=100)
    x1, x2 = np.meshgrid(x1_axis, x2_axis)

    # Plain ndarrays only: the previous np.matrix input is rejected by recent
    # scikit-learn releases, and the old local name shadowed builtin input().
    grid_points = np.c_[x1.ravel(), x2.ravel()]
    values_plot = np.asarray(model.predict(grid_points)).reshape(x1.shape)

    plt.figure()
    plt.scatter(x=positive[:, 0], y=positive[:, 1], color='k', marker='+')
    plt.scatter(x=negative[:, 0], y=negative[:, 1], color='y', marker='o')
    # Predictions are the labels 0 and 1, so the boundary is the contour at a
    # level strictly between them; level 0 coincides with the data minimum.
    plt.contour(x1, x2, values_plot, [0.5])
    plt.show()
Output for dataset2:
1.8 main function for dataset3
'''SVM with Gaussian kernel'''
# Dataset 3 also carries a validation split, so the file is read directly
# instead of going through read_data (which only returns X and y).
path3 = 'D:/新建文件夹/机器学习/Machine_Learning_exercise/exercise_6/ex6/ex6data3.mat'
data3 = loadmat(path3)
X3, y3 = data3['X'], data3['y']
Xval3, yval3 = data3['Xval'], data3['yval']

# Visualize the training examples.
visiualize_data(X3, y3)

# Solver settings that are not searched over.
kernel3, tol3, max_iter3 = 'rbf', 1e-3, 1000

# Determine the parameters C and sigma.
# NOTE(review): only the validation split is handed to find_best_params here.
C3, sigma3, best_score = find_best_params(Xval3, yval3)
print('C_best = ', C3)
print('sigma_best = ', sigma3)

# Train on the training split with the selected parameters
# (gamma = 1 / (2 * sigma^2)), then visualize the classifier.
gamma3 = 1 / (np.power(sigma3, 2) * 2)
model3 = training_data(X3, y3, C3, kernel3, tol3, max_iter3, gamma3)
find_decision_boundary_gaussian(model3, X3, y3)
dataset3:
1.9 find the best parameters for svm
def find_best_params(Xval, yval, X=None, y=None):
    """Grid-search ``C`` and ``sigma`` for an rbf-kernel ``svm.SVC``.

    Every combination of the candidate values is fitted and then scored on
    ``(Xval, yval)``; the first combination reaching the highest score wins.

    When a training split ``(X, y)`` is supplied, candidates are fitted on it
    and scored on the validation split. When it is omitted, candidates are
    fitted and scored on the validation split itself (the original
    behaviour), which tends to favour parameters that memorize that data.

    Args:
        Xval: validation features.
        yval: validation labels.
        X: optional training features used for fitting.
        y: optional training labels used for fitting.

    Returns:
        Tuple ``(best_C, best_sigma, best_score)``.

    Raises:
        ValueError: if only one of ``X`` and ``y`` is given.
    """
    if (X is None) != (y is None):
        raise ValueError('X and y must be supplied together')
    if X is None:
        X_fit, y_fit = Xval, yval
    else:
        X_fit, y_fit = X, y

    # The same candidate list is used for C and for sigma.
    candidates = (0.1, 0.3, 1.0, 3.0, 10.0, 30.0)

    best_score = 0
    best_C = 0
    best_sigma = 0
    for C in candidates:
        for sigma in candidates:
            # svm.SVC takes gamma rather than sigma: gamma = 1 / (2 * sigma^2).
            gamma = 1 / (2 * sigma ** 2)
            model_temp = svm.SVC(C=C, kernel='rbf', gamma=gamma, tol=1e-3)
            model_temp.fit(X_fit, y_fit)
            accuracy_temp = model_temp.score(Xval, yval)
            # Strict comparison keeps the earliest combination on ties.
            if accuracy_temp > best_score:
                best_score = accuracy_temp
                best_C = C
                best_sigma = sigma
    return best_C, best_sigma, best_score
Output for dataset3:
2. Spam Classification
2.1 training the svm algorithm
def training_data(X, y):
    """Fit a linear-kernel ``svm.SVC`` with ``C=100`` and report its accuracy.

    Args:
        X: training features.
        y: training labels.

    Returns:
        The fitted ``svm.SVC`` model.
    """
    classifier = svm.SVC(kernel='linear', C=100)
    fitted = classifier.fit(X, y)
    # Accuracy is measured on the same data the classifier was fit on.
    train_accuracy = classifier.score(X, y)
    print('an instance of estimator is ', fitted)
    print('the mean accuracy is ', train_accuracy)
    return classifier
2.2 main function
if __name__ == '__main__':
    # Read the training set and fit the classifier.
    path1 = 'D:/新建文件夹/机器学习/Machine_Learning_exercise/exercise_6/ex6/spamTrain.mat'
    data1 = loadmat(path1)
    X = data1['X']
    y = data1['y']
    model = training_data(X, y)

    # Compute the mean accuracy on the test set.
    path2 = 'D:/新建文件夹/机器学习/Machine_Learning_exercise/exercise_6/ex6/spamTest.mat'
    data2 = loadmat(path2)
    Xtest = data2['Xtest']
    ytest = data2['ytest']
    test_accuracy = model.score(Xtest, ytest)
    print('the mean accuracy of the test dataset is ', test_accuracy)

    # Rank the vocabulary by weight to see which words the classifier treats
    # as most predictive of spam. Plain ndarrays replace np.matrix here.
    w = np.asarray(model.coef_).reshape(-1, 1)
    w_index = np.argsort(-w, axis=0)  # row order from largest to smallest weight
    # Negate back after the descending sort so the table holds the real
    # weights; previously it stored them with the sign flipped.
    w_sort = -np.sort(-w, axis=0)
    w_new = np.c_[w_index + 1, w_sort]  # (1-based word index, weight) per row

    # Get the vocabulary list: one "<index>\t<word>" pair per line.
    path_volcabulary = 'D:/新建文件夹/机器学习/Machine_Learning_exercise/exercise_6/ex6/vocab.txt'
    # keep_default_na=False keeps words such as "null" or "na" as strings
    # instead of letting pandas turn them into NaN.
    data_volcabulary = pd.read_csv(path_volcabulary, header=None, names=['index', 'volcabulary'],
                                   sep='\t', keep_default_na=False)
    volcabulary = data_volcabulary.loc[:, ['volcabulary']].values

    # Print the 15 highest-weighted words as a column, one word per row.
    spam_words = volcabulary[w_index[0:15, 0], 0].reshape(-1, 1)
    print(spam_words)
Output---the spam words:
[['emailaddr']
['flash']
['tm']
['visit']
['our']
['remov']
['click']
['basenumb']
['will']
['numberb']
['wi']
['price']
['hear']
['tel']
['guarante']]