数据来自 UCI 数据集:皮马印第安人糖尿病数据集(Pima Indians Diabetes)。
载入数据
# -*- coding: utf-8 -*-
# Load the diabetes CSV and split it into training and test sets.
import matplotlib
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

# Set matplotlib's sans-serif font list and its minus-sign handling.
matplotlib.rcParams.update({
    'font.sans-serif': [u'simHei'],
    'axes.unicode_minus': False,
})

data_set = pd.read_csv('pima-indians-diabetes.csv')
data = data_set.values

# Columns 0-7 are used as the features, column 8 as the target.
X, y = data[:, :8], data[:, 8]

# No random_state is passed, so the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y)
建立决策树,网格搜索微调模型
# In[1] Tune the decision tree with a grid search.
# The classifier is wrapped in a one-step pipeline, so its hyper-parameters
# are addressed through the 'clf__' prefix in the grid below.
pipeline = Pipeline([
    ('clf', DecisionTreeClassifier(criterion='entropy')),
])
parameters = {
    'clf__max_depth': (3, 5, 10, 15, 20, 25, 30, 35, 40),
    'clf__min_samples_split': (2, 3),
    'clf__min_samples_leaf': (1, 2, 3),
}

# GridSearchCV walks every parameter combination and scores each one by
# cross-validation (F1 here) to pick the best setting.
# verbose must be a non-negative integer: the original passed -1, which
# recent scikit-learn releases reject during parameter validation.
# 0 keeps the search silent.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=0,
                           scoring='f1')
grid_search.fit(X_train, y_train)

# Report the best cross-validated score and the parameters that produced it.
best_parameters = grid_search.best_estimator_.get_params()
print("最好的F1值为:", grid_search.best_score_)
print('最好的参数为:')
for param_name in sorted(parameters):
    # NOTE(review): the leading 't' looks like a '\t' escape that lost its
    # backslash; it is kept unchanged so the output matches the recorded run.
    print('t%s: %r' % (param_name, best_parameters[param_name]))

# In[2] Predict on the test split and print the evaluation report.
predictions = grid_search.predict(X_test)
print(classification_report(y_test, predictions))
最好的F1值为: 0.5573515325670498 最好的参数为: tclf__max_depth: 5 tclf__min_samples_leaf: 1 tclf__min_samples_split: 2
评价模型
# In[2] Predict on the test split with the fitted grid search, then print
# the per-class precision / recall / F1 report.
predictions = grid_search.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))
precision recall f1-score support
0.0 0.74 0.89 0.81 124
1.0 0.67 0.43 0.52 68
画出决策树
# In[3] Fit a decision tree and render it with graphviz.
from sklearn import tree

# Feature names: every CSV column except the last one.
feature_name = data_set.columns.values.tolist()[:-1]

# NOTE(review): min_samples_leaf=5 differs from the best value reported by the
# grid search output (1); kept as in the original - confirm it is intentional.
DT = tree.DecisionTreeClassifier(criterion='entropy', max_depth=5,
                                 min_samples_split=2, min_samples_leaf=5)
DT.fit(X_train, y_train)

# export_graphviz expects class_names in ascending order of the class labels,
# so the name for class 0 comes first. The original list was reversed, which
# labelled every leaf with the opposite outcome.
# Assumes the target encodes 0 = no diabetes, 1 = diabetes - confirm against
# the CSV.
class_labels = ["无病", "有糖尿病"]

# Option 1 (disabled alternative): write the tree to a PDF through pydotplus.
# sklearn.externals.six has been removed from scikit-learn, so StringIO is
# taken from the standard library instead.
# import pydotplus
# from io import StringIO
# dot_data = StringIO()
# tree.export_graphviz(DT, out_file=dot_data, feature_names=feature_name,
#                      class_names=class_labels, filled=True, rounded=True,
#                      special_characters=True)
# graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# graph.write_pdf("Tree.pdf")
# print('Visible tree plot saved as pdf.')

# Option 2: export DOT text and render it with the graphviz package.
import graphviz

# DT must already be fitted before it can be exported.
dot_data = tree.export_graphviz(DT, out_file=None,
                                feature_names=feature_name,
                                class_names=class_labels)
graph = graphviz.Source(dot_data)
# Saves the DOT source as "tree2" and renders it next to it (PDF is the
# graphviz package's default output format).
graph.render("tree2")
随机森林
# -*- coding: utf-8 -*-
# Random forest on the same CSV: load, split, fit, and report.
import matplotlib
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

# Set matplotlib's sans-serif font list and its minus-sign handling.
matplotlib.rcParams.update({
    'font.sans-serif': [u'simHei'],
    'axes.unicode_minus': False,
})

data_set = pd.read_csv('pima-indians-diabetes.csv')
data = data_set.values

# Columns 0-7 are used as the features, column 8 as the target.
X, y = data[:, :8], data[:, 8]

# The split is unseeded, so it changes between runs even though the forest
# itself is seeded below.
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Ten trees with a fixed seed; fit() returns the estimator itself.
RF = RandomForestClassifier(n_estimators=10, random_state=11).fit(X_train, y_train)

predictions = RF.predict(X_test)
print(classification_report(y_test, predictions))
precision recall f1-score support 0.0 0.82 0.91 0.86 126 1.0 0.78 0.61 0.68 66 micro avg 0.81 0.81 0.81 192 macro avg 0.80 0.76 0.77 192 weighted avg 0.80 0.81 0.80 192