优秀相关博客参考链接:http://www.cnblogs.com/pinard/p/6053344.html
一、基础知识——信息熵与条件信息熵
二、决策树的定义与直观理解
三、决策树类库介绍——DecisionTreeClassifier 和 DecisionTreeRegressor
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author:ZhengzhengLiu
"""Iris classification with a decision tree.

The script runs four stages in order:

1. Load ``./datas/iris.data`` and split it 80/20 into train and test sets.
2. Min-max scale the features, keep the 3 best by chi-squared score, project
   them onto 2 principal components, fit an entropy-based decision tree and
   plot its decision regions.
3. Grid-search the same chain of steps (as a ``Pipeline``) over the number of
   selected features, the PCA size, the split criterion and the tree depth,
   then rebuild one fixed configuration by hand and score it.
4. Fit trees of depth 1..14 on the first two raw features only and plot the
   test error against the depth.

Figures are written to PNG files in the working directory and also shown.
"""

from sklearn import tree  # decision-tree utilities (e.g. export_graphviz)
from sklearn.tree import DecisionTreeClassifier  # classification tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV  # grid search + cross-validation
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler  # min-max normalisation
from sklearn.feature_selection import SelectKBest  # feature selection
from sklearn.feature_selection import chi2  # chi-squared statistic
from sklearn.decomposition import PCA  # principal component analysis
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

# Let matplotlib render the Chinese labels/titles used below.
mpl.rcParams['font.sans-serif'] = [u'simHei']
mpl.rcParams['axes.unicode_minus'] = False

# ---------------------------------------------------------------------------
# Load the data
# ---------------------------------------------------------------------------
path = "./datas/iris.data"
data = pd.read_csv(path, header=None)

iris_feature_E = "sepal length", "sepal width", "petal length", "petal width"
iris_feature_C = u"花萼长度", u"花萼宽度", u"花瓣长度", u"花瓣宽度"
iris_class = "Iris-setosa", "Iris-versicolor", "Iris-virginica"

# ---------------------------------------------------------------------------
# Separate features and target
# ---------------------------------------------------------------------------
# Columns 0..3 are the features; column 4 holds the class label.
x = data[list(range(4))]
# Categorical(...).codes turns the repeated text labels into integer codes.
y = pd.Categorical(data[4]).codes
print("样本总数:%d;特征属性数目:%d" % x.shape)
print(y)

# ---------------------------------------------------------------------------
# Train / test split
# ---------------------------------------------------------------------------
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    x, y, test_size=0.2, random_state=14)
# Keep the untouched split (the *1 names) for the later stages; the plain
# names below are overwritten by each preprocessing step.
x_train, x_test, y_train, y_test = x_train1, x_test1, y_train1, y_test1
print("训练数据集样本总数:%d;测试数据集样本总数:%d" % (x_train.shape[0], x_test.shape[0]))

# ---------------------------------------------------------------------------
# Scaling: fit on the training data only, then apply to both sets
# ---------------------------------------------------------------------------
ss = MinMaxScaler()
x_train = ss.fit_transform(x_train, y_train)
x_test = ss.transform(x_test)
print("原始数据各个特征的调整最小值:", ss.min_)
print("原始数据各个特征的缩放数据值:", ss.scale_)

# ---------------------------------------------------------------------------
# Feature selection: keep the features that influence the target the most.
# Common scores -- classification: F statistic, chi-squared, mutual
# information (mutual_info_classif); continuous targets: Pearson correlation,
# F statistic, mutual information.
# ---------------------------------------------------------------------------
# k defaults to 10; here the 3 highest-scoring of the 4 features are kept.
ch2 = SelectKBest(chi2, k=3)
x_train = ch2.fit_transform(x_train, y_train)  # fit and transform
x_test = ch2.transform(x_test)  # transform only
select_name_index = ch2.get_support(indices=True)
print("对类别判别影响最大的三个特征属性分别是:", ch2.get_support(indices=False))
print(select_name_index)

# ---------------------------------------------------------------------------
# Dimensionality reduction: with many features, model building gets complex,
# so the data is projected into a lower-dimensional space.
# Common methods: PCA (unsupervised; face recognition usually starts with a
# PCA step) and LDA (supervised; minimises the within-class variance).
# ---------------------------------------------------------------------------
# Two components are used only so the result can be drawn in a 2-D plot.
pca = PCA(n_components=2)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)

# ---------------------------------------------------------------------------
# Build, train and evaluate the model
# ---------------------------------------------------------------------------
model = DecisionTreeClassifier(criterion="entropy", random_state=0)
model.fit(x_train, y_train)
y_test_hat = model.predict(x_test)

# To render the tree with Graphviz, export it to a .dot file first, e.g.
#     tree.export_graphviz(model, out_file="iris.dot")

print("Score:", model.score(x_test, y_test))
print("Classes:", model.classes_)

# ---------------------------------------------------------------------------
# Predict on a regular N x N grid covering both data sets (decision regions)
# ---------------------------------------------------------------------------
N = 100
x1_min = np.min((x_train.T[0].min(), x_test.T[0].min()))
x1_max = np.max((x_train.T[0].max(), x_test.T[0].max()))
x2_min = np.min((x_train.T[1].min(), x_test.T[1].min()))
x2_max = np.max((x_train.T[1].max(), x_test.T[1].max()))

t1 = np.linspace(x1_min, x1_max, N)
t2 = np.linspace(x2_min, x2_max, N)
x1, x2 = np.meshgrid(t1, t2)  # grid sample points
# One row per grid point: (first component, second component).
x_show = np.column_stack((x1.ravel(), x2.ravel()))

y_show_hat = model.predict(x_show)
y_show_hat = y_show_hat.reshape(x1.shape)
print(y_show_hat.shape)
print(y_show_hat[0])

# ---------------------------------------------------------------------------
# Plot the decision regions with the samples on top
# ---------------------------------------------------------------------------
plt_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
plt_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])

plt.figure(facecolor="w")
plt.pcolormesh(x1, x2, y_show_hat, cmap=plt_light)
# Test samples: large stars drawn above everything else.
plt.scatter(x_test.T[0], x_test.T[1], c=y_test.ravel(), edgecolors="k",
            s=150, zorder=10, cmap=plt_dark, marker="*")
# Training samples: small default markers.
plt.scatter(x_train.T[0], x_train.T[1], c=y_train.ravel(), edgecolors="k",
            s=40, cmap=plt_dark)
plt.xlabel(u"特征属性1", fontsize=15)
plt.ylabel(u"特征属性2", fontsize=15)
plt.xlim(x1_min, x1_max)
plt.ylim(x2_min, x2_max)
plt.grid(True)
plt.title(u"鸢尾花数据的决策树分类", fontsize=18)
plt.savefig("鸢尾花数据的决策树分类.png")
plt.show()

# ---------------------------------------------------------------------------
# Hyper-parameter search over the whole preprocessing + model chain
# ---------------------------------------------------------------------------
pipe = Pipeline([
    ('mms', MinMaxScaler()),
    ('skb', SelectKBest(chi2)),
    ('pca', PCA()),
    ('decision', DecisionTreeClassifier()),
])

# Candidate values, keyed as <step name>__<parameter name>.
parameters = {
    "skb__k": [1, 2, 3, 4],
    # PCA takes a float only strictly between 0 and 1 (explained-variance
    # ratio); a value of 1 has to be the integer 1 (= one component), since
    # current scikit-learn rejects the float 1.0.
    # NOTE(review): if "keep all variance" was intended, use None instead.
    "pca__n_components": [0.5, 1],
    "decision__criterion": ["gini", "entropy"],
    "decision__max_depth": list(range(1, 16)),
}

# The search starts again from the raw (unscaled) split; the pipeline does
# its own preprocessing.
x_train2, x_test2, y_train2, y_test2 = x_train1, x_test1, y_train1, y_test1
gscv = GridSearchCV(pipe, param_grid=parameters)
gscv.fit(x_train2, y_train2)
print("最优参数列表:", gscv.best_params_)
print("score值:", gscv.best_score_)
y_test_hat2 = gscv.predict(x_test2)

# ---------------------------------------------------------------------------
# Rebuild one configuration by hand and score it on the test set.
# The constants match the best parameters shown in the recorded output at
# the end of this script; they are not read from gscv.best_params_.
# ---------------------------------------------------------------------------
mms_best = MinMaxScaler()
skb_best = SelectKBest(chi2, k=2)
pca_best = PCA(n_components=0.5)
decision3 = DecisionTreeClassifier(criterion="gini", max_depth=2)

x_train3, x_test3, y_train3, y_test3 = x_train1, x_test1, y_train1, y_test1
# Fit every step on the training data ...
x_train3 = mms_best.fit_transform(x_train3, y_train3)
x_train3 = skb_best.fit_transform(x_train3, y_train3)
x_train3 = pca_best.fit_transform(x_train3)
# ... and only apply the fitted steps to the test data.
x_test3 = pca_best.transform(skb_best.transform(mms_best.transform(x_test3)))
decision3.fit(x_train3, y_train3)
print("正确率:", decision3.score(x_test3, y_test3))

# ---------------------------------------------------------------------------
# Test error as a function of tree depth (first two raw features only)
# ---------------------------------------------------------------------------
x_train4, x_test4, y_train4, y_test4 = train_test_split(
    x.iloc[:, :2], y, train_size=0.7, random_state=14)

depths = np.arange(1, 15)
err_list = []
for d in depths:
    clf = DecisionTreeClassifier(criterion='gini', max_depth=d)
    clf.fit(x_train4, y_train4)
    score = clf.score(x_test4, y_test4)
    err = 1 - score
    err_list.append(err)
    print("%d深度,正确率%.5f" % (d, score))

# Plot error rate against depth.
plt.figure(facecolor='w')
plt.plot(depths, err_list, 'ro-', lw=3)
plt.xlabel(u'决策树深度', fontsize=16)
plt.ylabel(u'错误率', fontsize=16)
plt.grid(True)
plt.title(u'决策树层次太多导致的拟合问题(欠拟合和过拟合)', fontsize=18)
plt.savefig("决策树层次太多导致的拟合问题(欠拟合和过拟合).png")
plt.show()

# ---------------------------------------------------------------------------
# Recorded output of the author's original run (kept verbatim; numbers and
# array formatting may differ with other library versions):
# ---------------------------------------------------------------------------
# 样本总数:150;特征属性数目:4
# [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
# 训练数据集样本总数:120;测试数据集样本总数:30
# 原始数据各个特征的调整最小值: [-1.19444444 -0.83333333 -0.18965517 -0.04166667]
# 原始数据各个特征的缩放数据值: [ 0.27777778 0.41666667 0.17241379 0.41666667]
# 对类别判别影响最大的三个特征属性分别是: [ True False True True]
# [0 2 3]
# Score: 0.966666666667
# Classes: [0 1 2]
# (100, 100)
# [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
# 最优参数列表: {'skb__k': 2, 'decision__max_depth': 2, 'pca__n_components': 0.5, 'decision__criterion': 'gini'}
# score值: 0.933333333333
# 正确率: 1.0
# 1深度,正确率0.55556
# 2深度,正确率0.73333
# 3深度,正确率0.77778
# 4深度,正确率0.73333
# 5深度,正确率0.68889
# 6深度,正确率0.68889
# 7深度,正确率0.68889
# 8深度,正确率0.66667
# 9深度,正确率0.66667
# 10深度,正确率0.66667
# 11深度,正确率0.66667
# 12深度,正确率0.66667
# 13深度,正确率0.66667
# 14深度,正确率0.66667