CS231N学习笔记2 Assignment1_Q1: k-Nearest Neighbor classifier

时间:2021-04-16 21:22:14

1.代码

1.1 dataPerpare.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by wjbKimberly on 17-10-31
import numpy as np
import sys
sys.path.append("../../../")
from cs231n.data_utils import load_CIFAR10
import matplotlib.pyplot as plt

def dataPrepare(cifar10_dir,num_training,num_test):
    """
    Load CIFAR-10, draw a preview grid of sample images, and return a
    flattened subsample of the data.

    Inputs:
    - cifar10_dir: Path handed to load_CIFAR10.
    - num_training: Number of leading training examples to keep.
    - num_test: Number of leading test examples to keep.

    Returns:
    - (X_train, y_train, X_test, y_test), where each X array has been
      reshaped so that every example is a single row.
    """
    X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

    # Sanity check: report the raw array shapes.
    # Recorded output from the author's run:
    #   Training data shape:  (50000, 32, 32, 3)
    #   Training labels shape:  (50000,)
    #   Test data shape:  (10000, 32, 32, 3)
    #   Test labels shape:  (10000,)
    shape_report = (
        ('Training data shape: ', X_train),
        ('Training labels shape: ', y_train),
        ('Test data shape: ', X_test),
        ('Test labels shape: ', y_test),
    )
    for caption, array in shape_report:
        print(caption, array.shape)

    # Preview grid: one column per class, samples_per_class random training
    # images per column.  The figure is only built here; nothing in this
    # function displays it (there is no plt.show() call).
    classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
    samples_per_class = 7
    num_classes = len(classes)
    for col, cls in enumerate(classes):
        candidates = np.flatnonzero(y_train == col)
        chosen = np.random.choice(candidates, samples_per_class, replace=False)
        for row, idx in enumerate(chosen):
            # Subplot indices are 1-based and run row by row.
            plt.subplot(samples_per_class, num_classes, row * num_classes + col + 1)
            plt.imshow(X_train[idx].astype('uint8'))
            plt.axis('off')
            if row == 0:
                plt.title(cls)

    # Keep only the first num_training / num_test examples so that the rest
    # of the exercise runs faster.
    train_mask = list(range(num_training))
    test_mask = list(range(num_test))
    X_train, y_train = X_train[train_mask], y_train[train_mask]
    X_test, y_test = X_test[test_mask], y_test[test_mask]

    # Flatten every image into one row vector.
    X_train = X_train.reshape((X_train.shape[0], -1))
    X_test = X_test.reshape((X_test.shape[0], -1))

    return X_train, y_train, X_test, y_test

1.2 myKnn.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by wjbKimberly on 17-10-21

import numpy as np
from dataPerpare import dataPrepare
import sys
sys.path.append("../../../")
from cs231n.classifiers import KNearestNeighbor
import matplotlib.pyplot as plt

# Load the data: a 5000-example training subset and a 500-example test subset.
cifar10_dir = '../../../cs231n/datasets/cifar-10-batches-py'
num_training = 5000
num_test = 500
X_train, y_train, X_test, y_test=dataPrepare(cifar10_dir,num_training,num_test)


# Create a kNN classifier instance.
# Training a kNN classifier is a no-op: the classifier simply remembers the
# data and does no further processing.
classifier = KNearestNeighbor()
classifier.train(X_train, y_train)

# Distance matrix from the naive two-loop implementation; it is the reference
# that the faster implementations are compared against below.
dists = classifier.compute_distances_two_loops(X_test)
print(dists.shape)


# Visualize the distance matrix: each row is a single test example and its
# distances to the training examples.
# dataPrepare leaves its sample-image grid as pyplot's current figure without
# showing it, so start a fresh figure here; otherwise imshow would draw the
# matrix into the last subplot of that grid.
plt.figure()
plt.imshow(dists, interpolation='none')
plt.show()


# Classify the test set with k = 1 (plain nearest neighbor) and k = 5, and
# print the fraction of correctly predicted examples for each.
# Recorded output from the author's run:
#   Got 137 / 500 correct => accuracy: 0.274000
#   Got 139 / 500 correct => accuracy: 0.278000
for k in (1, 5):
    y_test_pred = classifier.predict_labels(dists, k=k)
    num_correct = np.sum(y_test_pred == y_test)
    accuracy = float(num_correct) / num_test
    print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))


# Partially vectorized version (one loop over the test points).
dists_one = classifier.compute_distances_one_loop(X_test)

# To check that the vectorized implementation agrees with the naive one we
# use the Frobenius norm of the difference: reshape both matrices into
# vectors and take the Euclidean distance between them.
difference = np.linalg.norm(dists - dists_one, ord='fro')
print('Difference was: %f' % (difference, ))
if difference < 0.001:
    print('Good! The distance matrices are the same')
else:
    print('Uh-oh! The distance matrices are different')


# Fully vectorized version, checked against the same reference.
dists_two = classifier.compute_distances_no_loops(X_test)

difference = np.linalg.norm(dists - dists_two, ord='fro')
print('Difference was: %f' % (difference, ))
if difference < 0.001:
    print('Good! The distance matrices are the same')
else:
    print('Uh-oh! The distance matrices are different')


# Let's compare how fast the implementations are
def time_function(f, *args):
    """
    Call a function f with args and return the time (in seconds) that it took
    to execute.

    Inputs:
    - f: Callable to run once.
    - *args: Positional arguments forwarded to f.

    Returns:
    - Elapsed time in seconds as a float. The return value of f is discarded.
    """
    import time
    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump when the system clock is
    # adjusted and may have coarse resolution.
    tic = time.perf_counter()
    f(*args)
    return time.perf_counter() - tic

# Time each distance implementation once on the test set and report it
# immediately.  The fully vectorized implementation is expected to be
# significantly faster than the looped ones.
_timed_variants = (
    (classifier.compute_distances_two_loops, 'Two loop version took %f seconds'),
    (classifier.compute_distances_one_loop, 'One loop version took %f seconds'),
    (classifier.compute_distances_no_loops, 'No loop version took %f seconds'),
)
_elapsed = []
for _method, _message in _timed_variants:
    _elapsed.append(time_function(_method, X_test))
    print(_message % _elapsed[-1])

# Keep the individual measurements available under their usual names.
two_loop_time, one_loop_time, no_loop_time = _elapsed



def initialTraini(X_train_folds,y_train_folds,num_folds,fi):
    """
    Collect every fold except fold number fi.

    Inputs:
    - X_train_folds: Indexable collection of data folds.
    - y_train_folds: Indexable collection of label folds, aligned with
      X_train_folds.
    - num_folds: Number of folds to consider (indices 0 .. num_folds - 1).
    - fi: Index of the fold to leave out.

    Returns:
    - (X_ans, y_ans): two lists holding the remaining folds, in their
      original order.
    """
    kept = [i for i in range(num_folds) if i != fi]
    X_ans = [X_train_folds[i] for i in kept]
    y_ans = [y_train_folds[i] for i in kept]
    return X_ans,y_ans

# Cross-validation
# We have implemented the k-Nearest Neighbor classifier but we set the value
# k = 5 arbitrarily.  We now determine the best value of this hyperparameter
# with cross-validation.
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

# Split the training data into num_folds folds; y_train_folds[i] is the label
# vector for the points in X_train_folds[i].
X_train_folds = np.array_split(X_train, num_folds)
y_train_folds = np.array_split(y_train, num_folds)

# k_to_accuracies[k] ends up as a list of length num_folds holding the
# validation accuracy obtained with that k on each fold, in fold order.
k_to_accuracies = {k_choice: [] for k_choice in k_choices}

for fi in range(num_folds):
    # Fold fi is the validation set; all the other folds form the training set.
    X_traini = np.vstack(X_train_folds[:fi] + X_train_folds[fi + 1:])
    y_traini = np.hstack(y_train_folds[:fi] + y_train_folds[fi + 1:])
    X_vali = X_train_folds[fi]
    y_vali = y_train_folds[fi]
    num_val = len(y_vali)

    classifier = KNearestNeighbor()
    classifier.train(X_traini, y_traini)

    # The distance matrix depends only on the fold, not on k, so compute it
    # once per fold and reuse it for every candidate k.
    dists = classifier.compute_distances_one_loop(X_vali)

    for ki in k_choices:
        # Use the candidate ki itself; a fixed k here would make every entry
        # of k_to_accuracies identical.
        y_val_pred = classifier.predict_labels(dists, k=ki)
        num_correct = np.sum(y_val_pred == y_vali)
        accuracy = float(num_correct) / num_val
        print('Got %d / %d correct => accuracy: %f' % (num_correct, num_val, accuracy))
        k_to_accuracies[ki].append(accuracy)

# List every recorded accuracy, grouped by k in ascending order.
for k, fold_accuracies in sorted(k_to_accuracies.items()):
    for accuracy in fold_accuracies:
        print('k = %d, accuracy = %f' % (k, accuracy))

# Scatter the raw per-fold observations, one vertical stack of points per k.
for k in k_choices:
    accuracies = k_to_accuracies[k]
    plt.scatter([k] * len(accuracies), accuracies)

# Trend line through the mean accuracy of each k (ascending k), with error
# bars equal to the standard deviation across folds.
_accuracies_by_k = [k_to_accuracies[k] for k in sorted(k_to_accuracies)]
accuracies_mean = np.array([np.mean(v) for v in _accuracies_by_k])
accuracies_std = np.array([np.std(v) for v in _accuracies_by_k])
plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
plt.title('Cross-validation on k')
plt.xlabel('k')
plt.ylabel('Cross-validation accuracy')
plt.show()



# Based on the cross-validation results above, choose the best value for k,
# retrain the classifier using all the training data, and test it on the test
# data. You should be able to get above 28% accuracy on the test data.
# The best k is the candidate with the highest mean cross-validation accuracy;
# max() keeps the first candidate on ties, so the earliest entry of k_choices
# wins when several candidates score the same.
best_k = max(k_choices, key=lambda k_choice: np.mean(k_to_accuracies[k_choice]))

classifier = KNearestNeighbor()
classifier.train(X_train, y_train)
y_test_pred = classifier.predict(X_test, k=best_k)

# Compute and display the accuracy
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))

if __name__ == '__main__':
    pass


1.3 k_nearest_neighbor.py

import numpy as np
from numpy import *#导入numpy的库函数

class KNearestNeighbor(object):
    """ a kNN classifier with L2 distance """

    def __init__(self):
        pass

    def train(self, X, y):
        """
        Train the classifier. For k-nearest neighbors this is just
        memorizing the training data.

        Inputs:
        - X: A numpy array of shape (num_train, D) containing the training data
          consisting of num_train samples each of dimension D.
        - y: A numpy array of shape (num_train,) containing the training
          labels, where y[i] is the label for X[i].
        """
        self.X_train = X
        self.y_train = y

    def predict(self, X, k=1, num_loops=0):
        """
        Predict labels for test data using this classifier.

        Inputs:
        - X: A numpy array of shape (num_test, D) containing test data consisting
          of num_test samples each of dimension D.
        - k: The number of nearest neighbors that vote for the predicted labels.
        - num_loops: Determines which implementation to use to compute distances
          between training points and testing points (0, 1 or 2).

        Returns:
        - y: A numpy array of shape (num_test,) containing predicted labels for the
          test data, where y[i] is the predicted label for the test point X[i].

        Raises:
        - ValueError: If num_loops is not 0, 1 or 2.
        """
        if num_loops == 0:
            dists = self.compute_distances_no_loops(X)
        elif num_loops == 1:
            dists = self.compute_distances_one_loop(X)
        elif num_loops == 2:
            dists = self.compute_distances_two_loops(X)
        else:
            raise ValueError('Invalid value %d for num_loops' % num_loops)

        return self.predict_labels(dists, k=k)

    def compute_distances_two_loops(self, X):
        """
        Compute the distance between each test point in X and each training point
        in self.X_train using a nested loop over both the training data and the
        test data.

        Inputs:
        - X: A numpy array of shape (num_test, D) containing test data.

        Returns:
        - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
          is the Euclidean distance between the ith test point and the jth training
          point.
        """
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            for j in range(num_train):
                # L2 distance between one test point and one training point;
                # the sum runs over the feature dimension without a loop.
                dists[i, j] = np.sqrt(np.sum(np.square(self.X_train[j] - X[i])))
        return dists

    def compute_distances_one_loop(self, X):
        """
        Compute the distance between each test point in X and each training point
        in self.X_train using a single loop over the test data.

        Input / Output: Same as compute_distances_two_loops
        """
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            # X[i] broadcasts against every row of X_train; summing over
            # axis 1 yields one squared distance per training point.
            dists[i, :] = np.sqrt(np.sum(np.square(self.X_train - X[i]), axis=1))
        return dists

    def compute_distances_no_loops(self, X):
        """
        Compute the distance between each test point in X and each training point
        in self.X_train using no explicit loops.

        Uses the expansion ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b: one matrix
        multiplication for the cross term plus two broadcast sums.

        Input / Output: Same as compute_distances_two_loops
        """
        # Squared norms are taken row by row.  Forming X.dot(X.T) or
        # X_train.dot(X_train.T) just to read the diagonal would allocate
        # (num_test, num_test) and (num_train, num_train) matrices.
        test_sq = np.sum(np.square(X), axis=1)[:, np.newaxis]    # (num_test, 1)
        train_sq = np.sum(np.square(self.X_train), axis=1)       # (num_train,)
        cross = np.dot(X, self.X_train.T)                        # (num_test, num_train)

        sq_dists = test_sq + train_sq - 2 * cross
        # Rounding can leave tiny negative values where the true distance is
        # (almost) zero; clamp them so that sqrt does not return NaN.
        sq_dists = np.maximum(sq_dists, 0)
        return np.sqrt(sq_dists)

    def predict_labels(self, dists, k=1):
        """
        Given a matrix of distances between test points and training points,
        predict a label for each test point.

        Inputs:
        - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
          gives the distance between the ith test point and the jth training point.
        - k: The number of nearest neighbors that vote for each label.

        Returns:
        - y: A numpy array of shape (num_test,) containing predicted labels for the
          test data, where y[i] is the predicted label for the test point X[i].
        """
        num_test = dists.shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            # Labels of the k training points closest to the ith test point.
            nearest = np.argsort(dists[i, :])[:k]
            closest_y = self.y_train[nearest]

            # Majority vote.  bincount counts the occurrences of every label
            # and argmax returns the first maximum, so ties are broken in
            # favour of the smaller label.
            y_pred[i] = np.argmax(np.bincount(closest_y))

        return y_pred


2.Tricks:

1.sum(a,axis=0/1)

np.sum 默认 axis=None,即对矩阵的所有元素求和,返回一个标量;指定 axis=0 时按列求和(把各行对应位置相加,每一列得到一个结果)。

而当加入 axis=1 以后就是按行求和:将矩阵每一行的元素相加,每一行得到一个结果。

2.no loop实现L2距离

将完全平方展开,利用矩阵乘法加速。

$d(x_1, x_2) = \sqrt{\sum_k (x_{1k} - x_{2k})^2}$

$\sum_k (x_{1k} - x_{2k})^2 = \lVert x_1 \rVert^2 + \lVert x_2 \rVert^2 - 2\, x_1 \cdot x_2$

其中 X 是 test 数据,

self.X_train 是 train 数据。

# Cross term x_test . x_train for every (test, train) pair.
M = np.dot(X, self.X_train.T)
# Squared norms of every test row and every training row.
te = np.square(X).sum(axis=1)
tr = np.square(self.X_train).sum(axis=1)
# te is turned into a column so that it broadcasts across the training axis.
dists = np.sqrt(-2 * M + tr + te[:, np.newaxis])

print(M.shape,te.shape,tr.shape,dists.shape)

(500, 5000) (500,) (5000,) (500, 5000)