Using the Kaggle practice competition Digit Recognizer as the running example.
1. kNN algorithm steps
Input: a training data set $T = \{(x_1, y_1), (x_2, y_2), \dots, (x_N, y_N)\}$, where $x_i$ is the feature vector of a sample and $y_i$ is its class label, together with a new instance $x$.
Output: the class $y$ to which the instance $x$ belongs.
(1) Using the given distance metric, compute the distance between the instance $x$ and every point in the training set.
(2) Select the $k$ points in the training set that are nearest to $x$, and assign $x$ the class that occurs most often among their labels (majority voting):
$$y = \arg\max_{c_j} \sum_{x_i \in N_k(x)} I(y_i = c_j),$$
where $N_k(x)$ denotes the set of the $k$ nearest neighbours of $x$ and $I(\cdot)$ is the indicator function. A minimal toy sketch of these two steps follows below.
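To make the two steps concrete, here is a minimal sketch (the arrays toy_X, toy_y, query and the value of k are made up purely for illustration) that finds the k nearest training points by Euclidean distance and then takes a majority vote:

import numpy as np
from collections import Counter

# a tiny hand-made training set: four 2-D points with labels 0/1
toy_X = np.array([[0.0, 0.0], [0.1, 0.2], [1.0, 1.0], [0.9, 1.1]])
toy_y = np.array([0, 0, 1, 1])
query = np.array([0.95, 1.0])   # the instance x to classify
k = 3

# step (1): distance from the query to every training point
dists = np.sqrt(((toy_X - query) ** 2).sum(axis=1))
# step (2): take the labels of the k nearest points and vote
nearest = np.argsort(dists)[:k]
print(Counter(toy_y[nearest]).most_common(1)[0][0])   # prints 1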
2. Distance metric for kNN
In most cases, kNN uses the Euclidean distance. Let the feature space $\mathcal{X}$ be the $n$-dimensional real vector space $\mathbb{R}^n$, and let $x_i, x_j \in \mathcal{X}$ with $x_i = (x_i^{(1)}, x_i^{(2)}, \dots, x_i^{(n)})^T$. The Euclidean distance between $x_i$ and $x_j$ is
$$L_2(x_i, x_j) = \left( \sum_{l=1}^{n} \left| x_i^{(l)} - x_j^{(l)} \right|^2 \right)^{\frac{1}{2}}.$$
Here $x_i^{(l)}$ denotes the $l$-th feature of $x_i$; for Digit Recognizer, the features are the 784 pixel values of a flattened 28x28 image.
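As a quick numeric check of the formula (the two short vectors below are made-up stand-ins for flattened images, not real data), the hand-computed sum of squared differences agrees with numpy's built-in norm:

import numpy as np

xi = np.array([0, 128, 255, 64])
xj = np.array([10, 120, 250, 70])
# sum the squared per-feature differences, then take the square root
manual = np.sqrt(np.sum((xi - xj) ** 2))
print(manual, np.linalg.norm(xi - xj))   # both print 15.0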
3. kNN implementation (1): numpy
In the code, the Euclidean distance computation is rewritten as
$$d(x, z) = \sqrt{\|x\|^2 - 2\, x \cdot z + \|z\|^2},$$
where $x$ is a test sample, $z$ is a training sample, and $x \cdot z$ is their inner product. With this expansion, the distances between every test sample and every training sample can be obtained from a single matrix product plus two vectors of squared norms.
(I tried both ways of computing the distance: first taking the difference, then squaring and summing, versus the expanded form above, and found the expanded form to be faster. A rough timing sketch follows below.)
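One rough way to check this claim is to time both formulations on random arrays with MNIST-like shapes. The sizes below are arbitrary and the timings will vary by machine, but the expanded form usually wins because it reduces to one BLAS matrix product:

import numpy as np
import time

rng = np.random.default_rng(0)
A = rng.integers(0, 256, size=(500, 784)).astype(np.float64)    # stand-in test batch
B = rng.integers(0, 256, size=(5000, 784)).astype(np.float64)   # stand-in training set

t0 = time.time()
# expanded form: ||a||^2 - 2 a.b + ||b||^2 via one matrix product
d1 = np.sqrt(-2 * A.dot(B.T) + (B ** 2).sum(axis=1) + (A ** 2).sum(axis=1)[:, None])
t1 = time.time()
# difference-first form: (a - b) squared and summed, one test row at a time
d2 = np.empty((A.shape[0], B.shape[0]))
for i in range(A.shape[0]):
    d2[i] = np.sqrt(((A[i] - B) ** 2).sum(axis=1))
t2 = time.time()

print("expanded: %.2fs  difference-first: %.2fs  max abs diff: %.2g"
      % (t1 - t0, t2 - t1, np.abs(d1 - d2).max()))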
# encoding: utf-8
import numpy as np
from collections import Counter
import pandas as pd

# read the Kaggle csv files
train_data = np.array(pd.read_csv('train.csv', header=0))
test_data = np.array(pd.read_csv('test.csv', header=0))
X_train = train_data[:, 1:]           # pixel features
y_train = train_data[:, 0]            # labels of the training set
X_test = test_data
print(train_data.shape, test_data.shape, len(train_data[0]))

class kNN:
    def __init__(self):
        pass

    def train(self, X, y):
        # kNN has no real training phase; just store the training set
        self.X_train = X
        self.y_train = y

    # Euclidean distances between every row of X and every training sample,
    # computed via the expansion ||x||^2 - 2*x.z + ||z||^2
    def compute_distance(self, X):
        dot_pro = np.dot(X, self.X_train.T)
        sum_square_test = np.square(X).sum(axis=1)
        sum_square_train = np.square(self.X_train).sum(axis=1)
        dists = np.sqrt(-2 * dot_pro + sum_square_train + sum_square_test[:, np.newaxis])
        return dists

    # kNN prediction: majority vote over the k nearest training samples
    def simple_knn(self, X, k):
        dists = self.compute_distance(X)   # distances from X to the training set
        num_test = X.shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            labels = self.y_train[np.argsort(dists[i, :])]
            k_close_y = labels[:k]          # labels of the k nearest training samples
            c = Counter(k_close_y)
            y_pred[i] = c.most_common(1)[0][0]
        return y_pred

# make predictions in batches to keep the distance matrix small
classifier = kNN()
classifier.train(X_train, y_train)
k = 1
batch_size = 2000                           # the 28000 test rows divide evenly by 2000
n_batches = len(X_test) // batch_size
predictions = []
for i in range(n_batches):
    # predict rows i * batch_size to (i+1) * batch_size
    print("Computing batch " + str(i + 1) + "/" + str(n_batches) + "...")
    predts = classifier.simple_knn(X_test[i * batch_size:(i + 1) * batch_size], k)
    predictions = predictions + list(predts)

# write the submission file
outfile = open("results.csv", "w")
outfile.write("ImageId,Label\n")
for i in range(len(predictions)):
    outfile.write(str(i + 1) + "," + str(int(predictions[i])) + "\n")
outfile.close()
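Since the Kaggle test set has no labels, one way to sanity-check the classifier before submitting is to hold out part of the labelled training data as a makeshift validation set. This sketch reuses X_train, y_train, kNN and k from the listing above; the split of 2000 rows is an arbitrary choice:

# hold out the last 2000 labelled rows for validation
X_tr, y_tr = X_train[:-2000], y_train[:-2000]
X_val, y_val = X_train[-2000:], y_train[-2000:]

val_clf = kNN()
val_clf.train(X_tr, y_tr)
val_pred = val_clf.simple_knn(X_val, k)
print("validation accuracy:", np.mean(val_pred == y_val))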
4. kNN implementation (2): sklearn
Solving the problem directly with the kNN classifier from the sklearn package is very convenient, but it hides the intermediate steps of kNN, so it is less suitable for machine-learning beginners.
# encoding: utf-8
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

traindata = np.array(pd.read_csv('train.csv', header=0))
testdata = np.array(pd.read_csv('test.csv', header=0))

model = KNeighborsClassifier(n_neighbors=3)
X_traindata = traindata[:, 1:]          # pixel features
label_traindata = traindata[:, 0]       # labels of the training set
model.fit(X_traindata, label_traindata)
pre = model.predict(testdata)
print(pre)

# write the submission file
out_file = open("result.csv", "w")
out_file.write("ImageId,Label\n")
for i in range(len(pre)):
    out_file.write(str(i + 1) + "," + str(int(pre[i])) + "\n")
out_file.close()
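If you do not want to hard-code n_neighbors=3, a common approach is to compare a few candidate values with cross-validation on the labelled training data. A minimal sketch, assuming the X_traindata and label_traindata arrays from the listing above (the candidate values and cv=3 are arbitrary choices, and running this on the full training set is slow, so you may want to subsample first):

from sklearn.model_selection import cross_val_score

for k in (1, 3, 5, 7):
    # mean accuracy over 3 cross-validation folds for this value of k
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k),
                             X_traindata, label_traindata, cv=3)
    print(k, scores.mean())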