KNN - juedaiyuer/researchNote GitHub Wiki
#K-近邻#
kNN.py文件
需要导入的模块
from numpy import *
import operator
改变当前路径到存储KNN.py文件的位置,当前python下有效
import sys
sys.path.append("file/loc")
import 模块
#自己使用的路径
sys.path.append("/home/juedaiyuer/mycode/researchNote/machinelearning/Ch02")
导入kNN.py文件
>>> import kNN
kNN模块定义了函数createDataSet,创建变量group和labels
group有4组数据,每组数据有两个已知的属性或者特征值
label每个数据点的标签信息,包含的元素个数等于group矩阵行数
>>> group,labels=kNN.createDataSet()
>>> group
array([[ 1. , 1.1],
[ 1. , 1. ],
[ 0. , 0. ],
[ 0. , 0.1]])
>>> labels
['A', 'A', 'B', 'B']
##文本文件中解析数据(分类器)##
'''
输入向量inX 用于分类
输入的训练样本集dataSet
标签向量labels
k 选择最近邻居的数目
标签向量的元素数目和矩阵dataSet的行数相同
'''
def classify0(inX, dataSet, labels, k):
    """k-nearest-neighbor classifier: majority label among the k closest rows.

    inX     -- input vector to classify
    dataSet -- training sample matrix, one sample per row
    labels  -- label vector; len(labels) equals dataSet row count
    k       -- number of nearest neighbors to vote
    """
    dataSetSize = dataSet.shape[0]
    # Euclidean distance: replicate inX to dataSet's shape, subtract, square,
    # sum per row, square-root.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDiffMat = diffMat**2
    sqDistances = sqDiffMat.sum(axis=1)
    distances = sqDistances**0.5
    # argsort yields indices ordered by ascending distance
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # FIX: iteritems() exists only in Python 2; items() behaves the same here
    # and works in both Python 2 and 3.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
为了预测数据所在的分类
>>> kNN.classify0([0,0],group,labels,3)
'B'
测试分类器的效果,使用已知答案的数据,检验分类器给出的结果是否符合预期结果,错误率是常用的评估方法
##改进约会网站配对效果##
###准备数据###
从文本文件中解析数据datingTestSet2.txt
将待处理数据的格式改变为分类器可以接受的格式
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and label list.

    Each line holds three numeric features followed by an integer class
    label. Returns (returnMat, classLabelVector): an (n, 3) numpy array
    and a list of ints, one per line.
    """
    # Read the file once; 'with' guarantees the handle is closed.
    # (The original opened the file twice and never closed either handle.)
    with open(filename) as fr:
        lines = fr.readlines()
    returnMat = zeros((len(lines), 3))
    classLabelVector = []
    for index, line in enumerate(lines):
        # strip trailing newline, split on tabs
        listFromLine = line.strip().split('\t')
        returnMat[index, :] = listFromLine[0:3]
        # last field is the class label
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
输入命令
>>> reload(kNN)
>>> datingDataMat,datingLabels=kNN.file2matrix('datingTestSet2.txt')
成功导入文件中的数据后,可以简单检查一下数据内容
>>> datingDataMat
array([[ 4.09200000e+04, 8.32697600e+00, 9.53952000e-01],
[ 1.44880000e+04, 7.15346900e+00, 1.67390400e+00],
[ 2.60520000e+04, 1.44187100e+00, 8.05124000e-01],
...,
[ 2.65750000e+04, 1.06501020e+01, 8.66627000e-01],
[ 4.81110000e+04, 9.13452800e+00, 7.28045000e-01],
[ 4.37570000e+04, 7.88260100e+00, 1.33244600e+00]])
>>> datingLabels[0:20]
[3, 2, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3]
###创建散点图###
使用matplotlib制作原始数据的散点图
>>> import matplotlib
>>> import matplotlib.pyplot as plt
>>> fig=plt.figure()
>>> ax=fig.add_subplot(111)
>>> ax.scatter(datingDataMat[:,1],datingDataMat[:,2])
<matplotlib.collections.PathCollection object at 0x7f9dc05aedd0>
>>> plt.show()
散点图使用datingDataMat矩阵的第二,三列数据
没有使用样本分类的特征值,难以辨别图中的点究竟属于哪个样本分类
Matplotlib库提供的scatter函数支持个性化标记散点图的点
###准备数据###
归一化数值
newValue=(oldValue-min)/(max-min)
代码如下
def autoNorm(dataSet):
    """Scale each column of dataSet into [0, 1]: new = (old - min) / (max - min).

    Returns (normDataSet, ranges, minVals) so callers can apply the same
    transform to new samples later.
    """
    # axis-0 min/max -> per-column extrema
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    m = dataSet.shape[0]
    # (the original pre-allocated a zeros matrix here that was immediately
    # overwritten; that dead allocation is removed)
    normDataSet = dataSet - tile(minVals, (m, 1))
    normDataSet = normDataSet / tile(ranges, (m, 1))  # element-wise divide
    return normDataSet, ranges, minVals
重新加载kNN.py模块
>>> reload(kNN)
>>> normMat,ranges,minVals=kNN.autoNorm(datingDataMat)
>>> normMat
array([[ 0.44832535, 0.39805139, 0.56233353],
[ 0.15873259, 0.34195467, 0.98724416],
[ 0.28542943, 0.06892523, 0.47449629],
...,
[ 0.29115949, 0.50910294, 0.51079493],
[ 0.52711097, 0.43665451, 0.4290048 ],
[ 0.47940793, 0.3768091 , 0.78571804]])
>>> ranges
array([ 9.12730000e+04, 2.09193490e+01, 1.69436100e+00])
>>> minVals
array([ 0. , 0. , 0.001156])
###测试:验证分类器###
分类器针对约会网站的测试代码
def datingClassTest():
    """Estimate the error rate of classify0 on the dating data set.

    The first hoRatio fraction of rows is held out as the test set; the
    remaining rows are the training data. Prints per-sample results, the
    total error rate, and the error count.
    """
    hoRatio = 0.50  # hold out 50% (the original comment wrongly said 10%)
    # load data set from file
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], 3)
        # single-argument print(...) parses in both Python 2 and Python 3
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print(errorCount)
##手写识别系统##
##source##
- 机器学习实战:第2章 k-近邻算法