机器学习之python: kNN

时间:2023-03-09 23:14:09
机器学习之python: kNN
 ##################################################
# kNN : k Nearest Neighbour
# Author : Monne
# Date : 2015-01-24
# Email : 416606639@qq.com
##################################################
import numpy as np
import time
# Wall-clock reference taken once when the module is loaded; the driver
# functions print elapsed time as time.time() - starttime.
starttime = time.time()

# The string literal below is a no-op expression statement.  It preserves an
# earlier, step-by-step kNN implementation that was disabled because it is
# longer than (and equivalent to) classify().  It is never executed.
""" too long , equal to classify()
def distance(xVec, yVec):
    # 1. attain distance from xVec and yVec
    x = np.array(xVec); y = np.array(yVec) # x = array([1,2,3]), y = array([2,3,4])
    diff = x - y # x - y = array([-1, -1, -1])
    diff2 = diff ** 2 # diff2 = diff**2 = array([1, 1, 1])
    sumdiff2 = sum(diff2) # sumdiff2 = sum(diff2) = 3
    sqrtsumdiff2 = sumdiff2 ** 0.5 # 9 ** 0.5 = 3.0
    return sqrtsumdiff2

def disttest(testx, trainx):
    # attain all the distance between testx and trainx[i]
    # from distx {ID: distance}
    distx = {}
    numsample = len(trainx)
    for i in range(numsample):
        distx[i] = distance(testx, trainx[i])
    return distx

def sort(testx, trainx):
    # sort distx {ID: distance}
    # return IDk
    distx = disttest(testx, trainx)
    sortitems = sorted(distx.iteritems(), key = lambda d:d[1]) # list
    IDk = []; distances = []
    l = len(trainx)
    for i in range(l):
        IDk.append(sortitems[i][0]) # ID
        distances.append(sortitems[i][1]) # distance
    #print "distances = ", distances[:5]
    return IDk

def majorcount(testx, trainx, trainy, k):
    IDk = sort(testx, trainx)
    sorty = {} # dist(y, count)
    #l = len(trainx)
    for i in range(k):
        sorty[trainy[IDk[i]]] = sorty.get(trainy[IDk[i]], 0) + 1
    sorty = sorted(sorty.iteritems(), key = lambda d:d[1], reverse = True) # list
    #print "sorty = ",sorty
    return sorty[0][0]

def kNN(testx, trainx, trainy, k):
    # given testx, trainx, trainy, k
    # return predict y
    c = classify(testx, trainx, trainy, k)
    print "the classifier came back: % r" % c
    return c
"""

# step 1. data input
def testsample():
    """Return a tiny hard-coded data set for quick experiments.

    Returns:
        (trainx, trainy): four 2-D points and their labels ('A' or 'B').
    """
    trainx = [[1.0, 1.1],
              [1.0, 1.0],
              [0, 0],
              [0, 0.1]]
    trainy = ['A', 'A', 'B', 'B']
    return trainx, trainy


def txt2trainxy(filename):
    """Load a whitespace-separated text file into features and labels.

    Args:
        filename: path WITHOUT the '.txt' extension, given as a string
            (e.g. 'datingTestSet2'); '.txt' is appended here.

    Returns:
        (trainx, trainy): trainx is a list of float lists built from every
        column except the last; trainy holds the last column of each row
        converted to int.
    """
    trainx = []
    trainy = []
    # The context manager closes the file even if a row fails to parse.
    with open(filename + '.txt') as fr:
        for line in fr:
            fields = line.split()
            if not fields:
                # A blank line (e.g. a trailing newline) carries no sample.
                continue
            # Build real lists: map() is a one-shot iterator on Python 3.
            trainx.append([float(v) for v in fields[:-1]])
            trainy.append(int(fields[-1]))
    return trainx, trainy


def img2trainxy(filename):
    """Load a directory of text-encoded images, one sample per file.

    Args:
        filename: directory path.  The first character of every file name
            is taken as that sample's integer label (e.g. '0_2.txt' -> 0).

    Returns:
        (trainx, trainy): trainx is a list of flat int lists, each the
        characters of all stripped lines of one file converted with int();
        trainy is the list of int labels, in os.listdir() order.
    """
    import os

    trainx = []
    trainy = []
    for name in os.listdir(filename):       # name = '0_2.txt'
        trainy.append(int(name[0]))         # '0' -> 0
        pixels = []
        with open(os.path.join(filename, name)) as fr:
            for line in fr:                 # line = '001100\r\n'
                # strip() removes the line ending; each remaining character
                # is one pixel.
                pixels.extend(int(ch) for ch in line.strip())
        trainx.append(pixels)
    return trainx, trainy


# step 2. data transform
def norm(trainx):
    """Min-max scale every feature column of trainx into the range [0, 1].

    Args:
        trainx: 2-D sequence (samples x features) of numbers.

    Returns:
        (ntrainx, col_min, col_range):
            ntrainx   -- the scaled samples as a list of lists,
            col_min   -- per-column minimum as a numpy array,
            col_range -- per-column (max - min) as a list of floats.
        A new sample is brought onto the same scale with
        (x - col_min) / col_range.

    A column whose values are all equal has a range of 0; its range is
    reported as 1.0 so that the column scales to 0.0 instead of dividing by
    zero and poisoning every distance with NaN.
    """
    data = np.array(trainx)
    col_min = data.min(0)                   # min(0) == min(axis=0)
    col_range = (data.max(0) - col_min).astype(float)
    col_range[col_range == 0] = 1.0
    ntrainx = (data - col_min) / col_range
    return ntrainx.tolist(), col_min, col_range.tolist()


# step 3. classify function
def classify(testx, trainx, trainy, k):
    """Predict the label of testx by majority vote of its k nearest samples.

    Args:
        testx: one feature vector (sequence or 1-D array).
        trainx: 2-D sequence of training feature vectors.
        trainy: labels, trainy[i] belonging to trainx[i].
        k: number of neighbours that vote.  If k exceeds the number of
            training samples, every sample votes.

    Returns:
        The label with the most votes among the nearest neighbours.  On a
        tie, max() returns the first tied label in dict iteration order.

    Raises:
        ValueError: if k < 1 or there are no training samples.
    """
    if k < 1:
        raise ValueError("k must be >= 1, got %r" % (k,))
    samples = np.array(trainx)
    if len(samples) == 0:
        raise ValueError("classify() needs at least one training sample")

    # Euclidean distance from testx to every training sample at once.
    diff = samples - np.array(testx)
    dist = (diff ** 2).sum(axis=1) ** 0.5
    order = dist.argsort()                  # indices, nearest first

    votes = {}                              # label -> count
    for idx in order[:k]:                   # slicing clamps k to len(order)
        label = trainy[idx]
        votes[label] = votes.get(label, 0) + 1
    return max(votes, key=votes.get)


# step 4. test for error rate
def testkNN(testratio, trainx, trainy, k):
    """Estimate the kNN error rate with a fixed hold-out split.

    The first int(len(trainx) * testratio) samples are used as test cases
    and all remaining samples form the training set.

    Args:
        testratio: fraction of the data to hold out for testing.
        trainx: list of feature vectors.
        trainy: labels, trainy[i] belonging to trainx[i].
        k: number of neighbours passed to classify().

    Returns:
        The error rate (misclassified / tested) as a float; it is also
        printed.

    Raises:
        ValueError: if the split leaves no test samples.
    """
    l = int(len(trainx) * testratio)
    if l < 1:
        raise ValueError("testratio %r leaves no test samples" % (testratio,))
    # Slice once instead of once per test sample.
    restx = trainx[l:]
    resty = trainy[l:]
    errorcount = 0
    for i in range(l):
        c = classify(trainx[i], restx, resty, k)
        if c != trainy[i]:
            errorcount += 1
    errorrate = errorcount / float(l)
    print("the total error rate is: %f." % errorrate)
    return errorrate
def randomtestkNN(testratio, trainx, trainy, k):
    """Estimate the kNN error rate with a random hold-out split.

    int(len(trainx) * testratio) distinct samples are drawn at random as
    test cases; every other sample stays in the training set.

    Args:
        testratio: fraction of the data to hold out for testing.
        trainx: list of feature vectors.
        trainy: labels, trainy[i] belonging to trainx[i].
        k: number of neighbours passed to classify().

    Returns:
        The error rate (misclassified / tested) as a float; it is also
        printed.

    Raises:
        ValueError: if the split leaves no test samples.
    """
    import random

    m = len(trainx)
    l = int(m * testratio)
    if l < 1:
        raise ValueError("testratio %r leaves no test samples" % (testratio,))

    # random.sample draws without replacement, so no index is picked twice
    # and the test and training sets stay disjoint.
    picked = random.sample(range(m), l)
    held_out = set(picked)
    testx = [trainx[i] for i in picked]
    testy = [trainy[i] for i in picked]
    restx = [trainx[i] for i in range(m) if i not in held_out]
    resty = [trainy[i] for i in range(m) if i not in held_out]

    errorcount = 0
    for features, expected in zip(testx, testy):
        if classify(features, restx, resty, k) != expected:
            errorcount += 1
    errorrate = errorcount / float(l)
    print("the total error rate is: %f." % errorrate)
    return errorrate


def avg():
    """Run the handwriting test for k = 1..9 and print the error rates.

    Prints the array of error rates and then its argsort(), i.e. the
    0-based positions ordered from lowest to highest error (position p
    corresponds to k = p + 1).

    Returns:
        The numpy array of the nine error rates.
    """
    rates = np.array([handwriting('trainingDigits', 'testDigits', i)
                      for i in range(1, 10)])
    print(rates)
    print(rates.argsort())
    return rates
# k = 4, errormin = 0.03


# step 5_1 small sample
def sample(k, testratio=0.25):
    """Run the hold-out test on the small built-in data set.

    Args:
        k: number of neighbours passed on to testkNN().
        testratio: fraction of the samples held out for testing.  testkNN()
            requires it as its first argument; the call here used to omit
            it and therefore always failed with a TypeError.

    Returns:
        Whatever testkNN() returns.
    """
    trainx, trainy = testsample()
    return testkNN(testratio, trainx, trainy, k)


# step 5_2. use for dating web site
def datingwebsite(filename, k):
    """Evaluate kNN on the dating data set, then classify one typed-in person.

    Args:
        filename: data file name WITHOUT '.txt', given as a string such as
            'datingTestSet2' (it is passed to txt2trainxy()).
        k: number of neighbours passed to classify().

    Returns:
        The error rate reported by randomtestkNN() for a 10% random
        hold-out.
    """
    # raw_input only exists on Python 2; input is its Python 3 equivalent.
    try:
        ask = raw_input
    except NameError:
        ask = input

    ## step 1: load data
    print("step 1: load data...")
    trainx, trainy = txt2trainxy(filename)
    trainx, col_min, col_range = norm(trainx)

    ## step 2: training...
    print("step 2: training...")
    # Nothing to do: classify() works directly on the stored samples.

    ## step 3: testing...
    print("step 3: testing...")
    errorrate = randomtestkNN(0.10, trainx, trainy, k)
    #testkNN(0.10, trainx, trainy, k)
    print("time cost:  %s" % (time.time() - starttime))

    ## step 4: show the result...
    print("step 4: show the result...")
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(ask(
        "percentage of time spent playing video games?> "))
    ffMiles = float(ask("frequent flier miles earned per year?> "))
    iceCream = float(ask("liters of ice cream consumed per year?> "))
    # Scale the new sample exactly like the training data before classifying.
    classx = ((np.array([ffMiles, percentTats, iceCream]) - col_min)
              / np.asarray(col_range, dtype=float))
    classy = classify(classx, trainx, trainy, k)
    # resultList is indexed with classy - 1, i.e. labels are taken as 1..3.
    print("You will probably like this person:  %s" % resultList[classy - 1])
    # The old return used errorcount and l, which are not defined in this
    # function; return the rate measured in step 3 instead.
    return errorrate


# step 5_3. use for hand writing
def handwriting(trainfile, testfile, k):
    """Measure the kNN error rate on the handwritten-digit data.

    A random subset of the training directory is used as the training set
    and every sample of the test directory is classified against it.

    Args:
        trainfile: directory of training images, read with img2trainxy().
        testfile: directory of test images, read with img2trainxy().
        k: number of neighbours passed to classify().

    Returns:
        The error rate (misclassified / tested) as a float; it is also
        printed.

    Raises:
        ValueError: if either directory yields no samples.
    """
    import random

    ## step 1: load data...
    print("step 1: load data...")
    print("---Getting training set...")
    trainx, trainy = img2trainxy(trainfile)
    print("---Geting testing set...")
    testx, testy = img2trainxy(testfile)
    if not trainx or not testx:
        raise ValueError("training and test sets must both be non-empty")
    m = len(trainx)
    print("%s %s" % (m, len(trainx[0])))
    print("%s %s" % (len(testx), len(testx[0])))

    # random choose trainx
    print("---Random choosing the training data...")
    # Draw at least k samples (or all m when fewer exist): a subset of size
    # 0, or smaller than k, cannot supply k neighbours to classify().
    low = min(k, m)
    n = random.randint(low, max(low, m - 1))    # random subset size
    s = random.sample(range(m), n)              # random sample indices
    trainx = [trainx[i] for i in s]
    trainy = [trainy[i] for i in s]
    print("---the numbers of training data is:  %s" % n)

    ## step 2: training...
    print("step 2: training...")
    # Nothing to do: classify() works directly on the stored samples.

    ## step 3: testing...
    print("step 3: testing...")
    l = len(testx)
    errorcount = 0
    for features, expected in zip(testx, testy):
        if classify(features, trainx, trainy, k) != expected:
            errorcount += 1
    errorrate = errorcount / float(l)
    print("the total error rate is: %f." % errorrate)
    print("time cost:  %s" % (time.time() - starttime))

    ## step 4: show the result...
    print("step 4: show the result...")
    return errorrate


#datingwebsite('datingTestSet2', 4)
handwriting('trainingDigits', 'testDigits', 3)
#avg()