实验环境
Windows7
Anaconda3
matplotlib
seaborn
pandas
numpy
实验内容
在 Jupyter Notebook 中利用 numpy、pandas、matplotlib、seaborn 以及 scikit-learn 进行模型的训练和优化,为后面预测房价提供算法模型。
实验步骤
一、数据来源
特征处理后的训练数据:https://download.csdn.net/download/qq_35809147/11178563
二、代码解析
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
# Load the feature-engineered training set from disk and take a first look.
csv_path = u'H://特征处理_train_data.csv'
train_data = pd.read_csv(csv_path)
# Preview the first rows (rendered by the notebook only when this is the
# last expression of a cell).
train_data.head()
# Print column names, non-null counts and dtypes.
train_data.info()
%matplotlib notebook
sns.set(style='whitegrid', context='notebook') # 设定样式,还原可用 sns.reset_orig
cols = train_data.columns
cm = np.corrcoef(train_data.values.T) # 计算相关系数
sns.set(font_scale=1.5)
# 画相关系数矩阵的热点图
hm = sns.heatmap(cm,
annot=True,
square=True,
fmt='.2f',
annot_kws={'size': 15},
yticklabels=cols,
xticklabels=cols)
plt.tight_layout()
sns.reset_orig()
sns.reset_orig()
# Separate the target column 'Hand' from the feature columns, then hold out
# 20% of the rows as a test set.
from sklearn.model_selection import train_test_split

y = train_data['Hand']
X = train_data.drop('Hand', axis='columns')

# random_state is fixed so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Baseline model: k-nearest-neighbours classifier with scikit-learn's default
# hyper-parameters, trained on the training split and evaluated on the
# held-out test split.
from sklearn.neighbors import KNeighborsClassifier

estimator = KNeighborsClassifier()
model = estimator.fit(X_train, y_train)

# For a classifier, score() returns the mean accuracy (a fraction in [0, 1]),
# not R^2, so the message is labelled accordingly.
print("准确率(score)为{0:.2f}".format(model.score(X_test, y_test)))

predict = model.predict(X_test)
# Same accuracy computed by hand and shown as a percentage; the '%' sign
# matches the later evaluation of the tuned model.
print('准确度为:{0:.2f}%'.format(np.mean(y_test == predict) * 100))
%matplotlib notebook
#优化模型——设置参数,应为近邻个数对邻近算法影响最大,所以从n_neighbores出发来优化模型
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import cross_val_score
all_scores = []
avg_scores =[]
for n_neighbors in range(1,21):
estimator = KNeighborsClassifier(n_neighbors=n_neighbors)
scores = cross_val_score(estimator, X = X, y = y)
all_scores.append(scores)
avg_scores.append(np.mean(scores))
# Plot the mean cross-validation score for each candidate n_neighbors, report
# the best mean score, then plot the individual fold scores per candidate.
import matplotlib.patches as mpatches

# Only referenced (as patchB) by the disabled annotate() call below.
el = mpatches.Ellipse((2.98, 0.86), 0.3, 0.4, angle=30, alpha=0.2)

plt.figure(figsize=(9, 6))
# Candidates are numbered from 1; derive the x axis from the data so it always
# matches the number of scores collected.
plt.plot(range(1, len(avg_scores) + 1), avg_scores, 'bo-')
plt.xlabel(u'近邻个数')
plt.ylabel(u'效果得分')
#plt.annotate(u'最优值', xy = (2.52, 0.86), xytext=(6.39, 0.86), arrowprops=dict(arrowstyle="fancy",color="0.5", patchB=el,shrinkB=0,connectionstyle="arc3,rad=0.3"))

# The original author read from the plot that n_neighbors=1 scored highest
# and hard-coded avg_scores[0]; take the maximum instead so the reported
# value stays correct if the data or the candidate range changes.
max_score = max(avg_scores)
print(u"最优值为:{0:.2f}".format(max_score))

# One vertical strip of points per candidate: every fold score is drawn at
# that candidate's x position.
for parameter_value, scores in enumerate(all_scores, start=1):
    n = len(scores)
    plt.plot([parameter_value] * n, scores, 'bo-')
# Re-train with the tuned setting (n_neighbors=1) on the training split and
# evaluate on the held-out test split.
from sklearn.neighbors import KNeighborsClassifier

estimator = KNeighborsClassifier(n_neighbors=1)
model = estimator.fit(X_train, y_train)

# For a classifier, score() returns the mean accuracy (a fraction in [0, 1]),
# not R^2, so the message is labelled accordingly.
print("准确率(score)为{0:.2f}".format(model.score(X_test, y_test)))

predict = model.predict(X_test)
# Same accuracy computed by hand and shown as a percentage.
print('准确度为:{0:.2f}%'.format(np.mean(y_test == predict) * 100))
# Fit the final model on ALL available data (no hold-out) and persist it to
# disk so it can be loaded later for prediction.
from sklearn.neighbors import KNeighborsClassifier

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; the standalone joblib package is the supported replacement. Fall back
# to the old location only for very old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

estimator = KNeighborsClassifier(n_neighbors=1)
predict_model = estimator.fit(X, y)

# Save the trained model to a file.
joblib.dump(predict_model, 'd://predict_model.m')
三、训练后的模型下载:
训练后的模型:https://download.csdn.net/download/qq_35809147/11179050