The idea behind the KNN (k-nearest neighbors) algorithm is simple: look at the k points closest to the input sample and check which class each of them belongs to; whichever class holds the majority among those k neighbors is the class assigned to the sample. In other words, the training set consists of data that has already been labeled by hand, and KNN uses those labels to exploit the similarity between objects of the same class and infer the sample's label.
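To make the idea concrete, here is a minimal sketch of the same majority-vote classification using scikit-learn's KNeighborsClassifier. scikit-learn is not used in the original post; the toy data simply mirrors the createDataSet function in the listing below:

import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# Toy training set: two points near (1, 1) labeled 'A', two points near (0, 0) labeled 'B'
characters = np.array([[1.0, 1.1], [1.0, 1.0], [0.0, 0.0], [0.0, 0.1]])
labels = ['A', 'A', 'B', 'B']

# k = 3: each prediction is a majority vote among the 3 nearest training points
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(characters, labels)

print(clf.predict([[0.0, 0.0]]))  # prints ['B'], since the query point sits among the 'B' samples

A ready-made classifier like this gives the same behavior; the hand-written version below is mainly useful for understanding what happens inside.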
The accuracy of KNN is quite sensitive to the choice of k, so it is often worth writing a loop that tries several values of k and picks the one that works best for the data set at hand (see the sketch after the code listing).
As for how to obtain the feature vectors, see the earlier blog post.
Code:
# -*- coding: utf-8 -*-
__author__ = 'Rossie'
from numpy import *
import operator


def createDataSet():
    """Build a small hand-crafted data set."""
    characters = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return characters, labels


def file2matrix(filename):
    """Read the data file, convert the text records into a matrix,
    and extract the features and class labels."""
    fr = open(filename)
    arrayOLines = fr.readlines()
    numberOfLines = len(arrayOLines)        # number of lines in the file
    returnMat = zeros((numberOfLines, 3))   # numberOfLines x 3 NumPy matrix filled with zeros
    classLabelVector = []
    index = 0
    for line in arrayOLines:                # parse each line of the file into the lists
        line = line.strip()
        listFromLine = line.split('\t')
        returnMat[index, :] = listFromLine[0:3]
        classLabelVector.append(listFromLine[-1])
        index += 1
    return returnMat, classLabelVector      # return the feature matrix and the label list


def autoNorm(dataSet):
    """Normalize the numeric feature values into the 0-1 range.
    The input is the feature matrix."""
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    normDataSet = zeros(shape(dataSet))
    m = dataSet.shape[0]
    normDataSet = dataSet - tile(minVals, (m, 1))
    normDataSet = normDataSet / tile(ranges, (m, 1))
    return normDataSet, ranges, minVals


def classify(sample, dataSet, labels, k):
    dataSetSize = dataSet.shape[0]          # number of rows, i.e. number of training records
    # distance computation
    diffMat = tile(sample, (dataSetSize, 1)) - dataSet  # difference between the sample and every training point
    sqDiffMat = diffMat ** 2                # squared differences
    sqDistances = sqDiffMat.sum(axis=1)     # sum over each row
    distances = sqDistances ** 0.5          # square root gives the Euclidean distances
    sortedDistIndicies = distances.argsort()  # indices that sort the distances in ascending order
    # pick the k points with the smallest distances
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # sort the vote counts in descending order
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def datingClassTest():
    """Test code for the dating-site data."""
    hoRatio = 0.20                          # fraction of the data held out for testing
    datingDataMat, datingLabels = file2matrix('datingTestSet1.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    k = 4
    for i in range(numTestVecs):
        classifierResult = classify(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], k)
        print("The classifier came back with: %s, the real answer is: %s" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))


def main():
    sample = [0, 0]                             # test with a simple sample
    sampleText = [39948, 6.830795, 1.213342]    # test with a sample vector like those in the text file
    k = 3
    group, labels = createDataSet()
    label1 = classify(sample, group, labels, k)      # classification result for the simple sample
    fileN = "datingTestSet.txt"
    matrix, label = file2matrix(fileN)
    label2 = classify(sampleText, matrix, label, k)  # classification result for the text sample
    print("Classified Label of the simple sample:" + label1)
    print("Classified Label of the text sample:" + label2)


if __name__ == '__main__':
    main()
    # datingClassTest()
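As a complement to the hard-coded k = 4 in datingClassTest, below is a sketch of the loop mentioned at the top for picking k. It is not part of the original post: it reuses the file2matrix, autoNorm and classify functions from the listing above, and the candidate range 1..20, the hold-out ratio and the file name are assumptions you would adapt to your own data set.

def chooseBestK(filename='datingTestSet1.txt', hoRatio=0.20, kCandidates=range(1, 21)):
    """Try several values of k on a hold-out split and return the one with the lowest error rate.
    Relies on file2matrix, autoNorm and classify from the listing above."""
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)          # the first numTestVecs rows are the hold-out set
    bestK, bestError = None, float('inf')
    for k in kCandidates:
        errorCount = 0.0
        for i in range(numTestVecs):
            result = classify(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], k)
            if result != datingLabels[i]:
                errorCount += 1.0
        errorRate = errorCount / float(numTestVecs)
        print("k = %d, error rate = %f" % (k, errorRate))
        if errorRate < bestError:
            bestK, bestError = k, errorRate
    return bestK

Calling chooseBestK() with the data file in the working directory prints the hold-out error for every candidate k and returns the best one, which can then replace the fixed k = 4 in datingClassTest.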
That's all for this article. I hope it helps with your studies, and thanks for your continued support of 服务器之家.
Original article: https://blog.csdn.net/RossieSeven/article/details/52629520