本文实例讲述了Python实现的knn算法。分享给大家供大家参考,具体如下:
knn(k近邻)算法的基本思想是:计算待分类样本与所有训练样本之间的距离,取距离最近的k个样本,以其中出现次数最多的类别作为预测结果。对算法原理感兴趣的读者可以进一步了解。
具体代码:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
|
# -*- coding:utf-8 -*-
#! python2
'''''
@author:zhoumeixu
createdate:2015年8月27日
'''
#np.zeros((4,2))
#np.zeros(8).reshape(4,2)
#x=np.array([[1.0,1.1],[1.0,1.0],[0,0],[0,0.1]]) np.zeros_like(x)
# 最值和排序:最值有np.max(),np.min() 他们都有axis和out(输出)参数,
# 而通过np.argmax(), np.argmin()可以得到取得最大或最小值时的 下标。
# 排序通过np.sort(), 而np.argsort()得到的是排序后的数据原来位置的下标
# 简单实现knn算法的基本思路
import numpy as np
import operator #运算符操作包
from _ctypes import Array
from statsmodels.sandbox.regression.kernridgeregress_class import plt_closeall
def createDataSet():
    """Return the four-point toy training set used to demo the classifier.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 NumPy array of
        sample points and ``labels`` is a list holding the class name
        (``'A'`` or ``'B'``) of the corresponding row.
    """
    group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
# Build the toy sample set once at module level; the demo call further down
# reads these two names.
group, labels = createDataSet()
def classify0(inx, dataSet, labels, k):
    """Classify ``inx`` by majority vote among its k nearest training rows.

    Distance is Euclidean, computed against every row of ``dataSet``.

    Args:
        inx: Query point; a sequence or array that broadcasts against one
            row of ``dataSet``.
        dataSet: 2-D NumPy array with one training sample per row.
        labels: Sequence of class labels; ``labels[i]`` belongs to
            ``dataSet[i]``.
        k: Number of nearest neighbours that vote.

    Returns:
        The label that received the most votes.

    Raises:
        IndexError: If ``k`` exceeds the number of training rows, or if
            ``k`` is smaller than 1 (no votes are cast).
    """
    # NumPy broadcasting subtracts the query from every row, so the explicit
    # np.tile copy of the query is not needed.
    diffMat = np.asarray(inx) - dataSet
    distances = ((diffMat ** 2).sum(axis=1)) ** 0.5
    # argsort yields the original row indices ordered from nearest to farthest.
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # items() exists on both Python 2 and 3, unlike iteritems().
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    return sortedClassCount[0][0]
# Demo: classify the origin against the toy sample set when run as a script.
if __name__ == '__main__':
    print(classify0([0, 0], group, labels, 4))
# 利用knn算法改进约会网站的配对效果
def file2matrix(filename):
    """Parse a tab-separated sample file into a feature matrix and labels.

    Each non-blank line must hold at least three numeric feature columns
    followed by an integer class label in the last column.

    Args:
        filename: Path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        ``(n, 3)`` float array built from the first three columns and
        ``classLabelVector`` is a list of the ``n`` integer labels.

    Raises:
        ValueError: If a feature or label field cannot be converted to a
            number.
    """
    # The context manager closes the file even if parsing fails later.
    with open(filename) as fr:
        stripped = [line.strip() for line in fr]
    # Blank lines (e.g. a trailing newline) carry no sample; skip them so
    # they neither crash the parser nor leave all-zero rows in the matrix.
    arrayOLines = [line for line in stripped if line]
    returnMat = np.zeros((len(arrayOLines), 3))
    classLabelVector = []
    for index, line in enumerate(arrayOLines):
        listFromLine = line.split('\t')
        returnMat[index, :] = [float(field) for field in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
# Directory that holds the sample data files; the trailing separator lets
# file names be appended directly.
path = u'D:\\Users\\zhoumeixu204\\Desktop\\python语言机器学习\\机器学习实战代码 python\\机器学习实战代码\\machinelearninginaction\\Ch02\\'
datingDataMat, datingLabels = file2matrix(path + 'datingTestSet2.txt')
import matplotlib
import matplotlib.pyplot as plt
# First figure: plain scatter of feature column 1 against column 2.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2])
plt.show()
# Second figure: same axes data, marker size scaled by the class label and
# colour scaled by column 2. A fresh figure is created because the previous
# one has already been shown; drawing on it again would not be displayed.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2],
           15.0 * np.array(datingLabels),
           15 * np.array(datingDataMat[:, 2]))
plt.show()
def autoNorm(dataset):
    """Min-max scale every column of ``dataset`` into the range [0, 1].

    Args:
        dataset: 2-D array-like with one sample per row.

    Returns:
        tuple: ``(normDataSet, ranges, minVals)`` where ``normDataSet`` is
        the scaled float array, ``ranges`` is the per-column ``max - min``
        and ``minVals`` is the per-column minimum.
    """
    # Work in float so integer input is not truncated by integer division.
    dataset = np.asarray(dataset, dtype=float)
    minVals = dataset.min(0)
    maxVals = dataset.max(0)
    ranges = maxVals - minVals
    # A constant column has range 0; divide by 1 there so it scales to 0
    # instead of producing NaN. The returned ``ranges`` stays unmodified.
    safeRanges = np.where(ranges == 0, 1.0, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataset - minVals) / safeRanges
    return normDataSet, ranges, minVals
# Scale the dating features at module level and keep the per-column range
# and minimum alongside the scaled matrix.
normMat, ranges, minVals = autoNorm(datingDataMat)
def datingClassTest(hoRatio=0.1, k=3):
    """Hold-out evaluation of the kNN classifier on the dating data set.

    The first ``hoRatio`` fraction of rows in ``datingTestSet2.txt`` is used
    as the test set and the remaining rows as training data. Each prediction
    and the final error rate are printed.

    Args:
        hoRatio: Fraction of rows held out for testing (default 0.1).
        k: Number of neighbours passed to ``classify0`` (default 3).

    Raises:
        ZeroDivisionError: If the hold-out set is empty, i.e.
            ``int(rows * hoRatio)`` is 0.
    """
    datingDataMat, datingLabels = file2matrix(path + 'datingTestSet2.txt')
    normMat, _, _ = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        # Rows [numTestVecs, m) are the training set; row i is the query.
        classifierResult = classify0(normMat[i, :],
                                     normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m],
                                     k)
        # print() with one pre-formatted string behaves the same on
        # Python 2 and Python 3.
        print("the classifier came back with :%d,the real answer is :%d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rare is :%f" % (errorCount / float(numTestVecs)))
# Run the dating hold-out evaluation when executed as a script.
if __name__ == '__main__':
    datingClassTest()
#利用构建好的模型进行预测
def classifyPerson():
    """Prompt for three features and print the predicted dating preference.

    Reads three numbers from standard input, scales them with the minimum
    and range of the training data loaded from ``datingTestSet2.txt``, runs
    ``classify0`` with ``k=3`` and prints the matching entry of the result
    list.

    Raises:
        ValueError: If an answer cannot be converted to ``float``.
    """
    # Class labels 1..3 index this list after subtracting 1. The second and
    # third entries were garbled in the source ('in same doses',
    # 'in large d oses') and are restored here.
    resultList = ['not at all', 'in small doses', 'in large doses']
    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        readLine = raw_input
    except NameError:
        readLine = input
    percentTats = float(readLine("percentage if time spent playin cideo games:"))
    ffMiles = float(readLine("frequnet fliter miles earned per year:"))
    iceCream = float(readLine("liters of ice cream consumed per year:"))
    datingDataMat, datingLabels = file2matrix(path + 'datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # Feature order matches the order used when building inArr in the
    # original code: miles, game percentage, ice cream.
    inArr = np.array([ffMiles, percentTats, iceCream])
    # Guard against a constant training column (range 0) to avoid inf/NaN.
    safeRanges = np.where(ranges == 0, 1.0, ranges)
    classifierResult = classify0((inArr - minVals) / safeRanges,
                                 normMat, datingLabels, 3)
    # A single formatted string prints identically on Python 2 and 3.
    print("you will probably like the person: %s"
          % resultList[classifierResult - 1])
# NOTE(review): this guard is inverted relative to the others in the file, so
# the interactive prompt runs on import and is skipped when the file is run as
# a script. That may be a deliberate way to disable it; confirm before
# changing the condition.
if __name__ != '__main__':
    classifyPerson()
#利用knn算法进行手写识别系统验证
# Base directory for the handwriting samples; the trailing separator lets
# sub-directory and file names be appended directly.
path = u'D:\\Users\\zhoumeixu204\\Desktop\\python语言机器学习\\机器学习实战代码 python\\机器学习实战代码\\machinelearninginaction\\Ch02\\'
def img2vector(filename, size=32):
    """Flatten a text-encoded square digit image into a single row vector.

    The file is read as ``size`` lines; the first ``size`` characters of
    each line are converted with ``int`` and stored row by row.

    Args:
        filename: Path of the text image file.
        size: Number of rows and of columns to read (default 32).

    Returns:
        A ``(1, size * size)`` float array; element ``size * i + j`` holds
        the value at row ``i``, column ``j``.

    Raises:
        IndexError: If a line has fewer than ``size`` characters.
        ValueError: If a character is not a digit.
    """
    returnVect = np.zeros((1, size * size))
    # The context manager releases the handle; the tests open thousands of
    # these files in a row.
    with open(filename) as fr:
        for i in range(size):
            lineStr = fr.readline()
            for j in range(size):
                returnVect[0, size * i + j] = int(lineStr[j])
    return returnVect
# Smoke check: load one test digit and print the first 31 values of its
# flattened vector.
testVector = img2vector(path + 'testDigits\\0_13.txt')
print(testVector[0, 0:31])
import os
def _digitLabelFromFileName(fileNameStr):
    """Return the integer before the first underscore of a sample file name."""
    fileStr = fileNameStr.split('.')[0]
    return int(fileStr.split('_')[0])


def handwritingClassTest():
    """Evaluate the kNN classifier on the handwritten-digit sample files.

    Every file in ``trainingDigits`` under ``path`` becomes one training
    row, labelled with the number that precedes the underscore in its file
    name. Every file in ``testDigits`` is then classified with ``k=3``;
    each prediction, the error count and the error rate are printed.

    Raises:
        ZeroDivisionError: If ``testDigits`` contains no files.
    """
    trainingDir = path + 'trainingDigits'
    testDir = path + 'testDigits'

    trainingFileList = os.listdir(trainingDir)
    trainingMat = np.zeros((len(trainingFileList), 1024))
    hwLabels = []
    for i, fileNameStr in enumerate(trainingFileList):
        hwLabels.append(_digitLabelFromFileName(fileNameStr))
        # os.path.join picks the platform separator instead of a hard-coded
        # backslash.
        trainingMat[i, :] = img2vector(os.path.join(trainingDir, fileNameStr))

    testFileList = os.listdir(testDir)
    mTest = len(testFileList)
    errorCount = 0.0
    for fileNameStr in testFileList:
        classNumStr = _digitLabelFromFileName(fileNameStr)
        vectorUnderTest = img2vector(os.path.join(testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print("the classifier canme back with:%d,the real answer is :%d"
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\nthe total number of errors is :%d" % errorCount)
    print("\n the total error rate is :%f" % (errorCount / float(mTest)))
# Run the handwritten-digit evaluation when executed as a script.
if __name__ == '__main__':
    handwritingClassTest()
|
运行结果如下图:
注:这里使用到了statsmodels模块,可以点击此处本站下载statsmodels安装模块,再进入statsmodels模块所在目录位置,使用:
1
|
pip install statsmodels-0.9.0-cp27-none-win32.whl
|
进行statsmodels模块的安装
同理,出现ImportError: No module named pandas错误提示时,点击此处本站下载pandas模块,再使用
1
|
pip install pandas-0.23.1-cp27-none-win32.whl
|
进行pandas模块的安装
希望本文所述对大家Python程序设计有所帮助。
原文链接:https://blog.csdn.net/luoyexuge/article/details/49104367