1.案例:这个数据用身高和体重来界定胖瘦。如下文字档(4.tree.txt),三个栏位各代表身高(m)、体重(kg)与胖瘦(thin/fat)。
2.问题:现在有一身高1.5m与体重99kg的数据,请问是胖是瘦呢?
3.数据文档:4.tree.txt,内容如下。
1.5 50 thin
1.5 60 fat
1.6 40 thin
1.6 60 fat
1.7 60 thin
1.7 80 fat
1.8 60 thin
1.8 90 fat
1.9 70 thin
1.9 80 fat
4.Sample code:
# -*- coding: utf-8 -*-
"""Decision-tree demo: tell "thin" from "fat" using height (m) and weight (kg).

Reads the whitespace-separated samples in ``4.tree.txt``, trains an
entropy-based ``DecisionTreeClassifier`` on a random 80% of them, writes the
fitted tree to ``1.tree.dot``, prints a few diagnostics and finally predicts
the class of a 1.5 m / 99 kg sample.
"""
import numpy as np
import scipy as sp  # noqa: F401 -- unused below; kept from the original script

# Numeric class assigned to each textual label found in the data file.
_LABEL_TO_CLASS = {"thin": 0.0, "fat": 1.0}


def load_dataset(path="4.tree.txt"):
    """Load the samples stored in *path*.

    Each non-blank line holds whitespace-separated numeric features followed
    by a textual label (``thin`` or ``fat``) as the last token.

    Args:
        path: Path of the text file to read.

    Returns:
        A tuple ``(x, y)`` of NumPy float arrays: ``x`` holds one row of
        features per sample, ``y`` holds 0.0 for ``thin`` and 1.0 for ``fat``.

    Raises:
        ValueError: If a feature is not a number or a label is neither
            ``thin`` nor ``fat``.
    """
    features = []
    targets = []
    with open(path) as ifile:
        for line_no, line in enumerate(ifile, 1):
            # split() without arguments tolerates tabs and repeated spaces.
            tokens = line.split()
            if not tokens:
                # A blank (e.g. trailing) line would otherwise yield an
                # empty feature row and a bogus label.
                continue
            label = tokens[-1]
            if label not in _LABEL_TO_CLASS:
                raise ValueError(
                    "%s:%d: unknown label %r" % (path, line_no, label))
            features.append([float(tk) for tk in tokens[:-1]])
            targets.append(_LABEL_TO_CLASS[label])
    return np.array(features, dtype=float), np.array(targets, dtype=float)


def main():
    """Train the decision tree on ``4.tree.txt`` and print the diagnostics."""
    # scikit-learn is imported here rather than at module level so that
    # load_dataset() can be imported with NumPy alone.
    from sklearn import tree
    from sklearn.metrics import classification_report
    from sklearn.metrics import precision_recall_curve
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:  # older scikit-learn only ships cross_validation
        from sklearn.cross_validation import train_test_split

    # Read the data; labels are already converted to 0/1.
    x, y = load_dataset("4.tree.txt")

    # Split into training and test data (no random_state: the split, and
    # therefore the printed output, differs from run to run).
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    # Train the tree using information entropy as the split criterion.
    clf = tree.DecisionTreeClassifier(criterion='entropy')
    print(clf)
    clf.fit(x_train, y_train)

    # Write the tree structure to a Graphviz file.
    with open("1.tree.dot", 'w') as f:
        tree.export_graphviz(clf, out_file=f,
                             feature_names=["height", "weight"])

    # Importance of each feature; larger means the feature contributes more
    # to the classification.
    print(clf.feature_importances_)

    # Predictions on the training data next to the true labels.
    answer = clf.predict(x_train)
    print(x_train)
    print(answer)
    print(y_train)
    print(np.mean(answer == y_train))

    # Precision and recall. precision_recall_curve takes scores (the
    # probability of the positive class), while classification_report takes
    # predicted class labels. The curve values are computed but, as in the
    # original script, not printed.
    precision, recall, thresholds = precision_recall_curve(
        y_train, clf.predict_proba(x_train)[:, 1])
    answer = clf.predict(x)
    print(classification_report(y, answer, target_names=['thin', 'fat']))

    # Predict the class of the sample asked about: 1.5 m, 99 kg.
    print(clf.predict([[1.5, 99]]))


if __name__ == "__main__":
    main()
5.结果(训练集与测试集为随机拆分,且输出格式随 scikit-learn 版本而异,以下仅为一次运行的示例):
DecisionTreeClassifier(compute_importances=None, criterion=entropy,
max_depth=None, max_features=None, min_density=None,
min_samples_leaf=1, min_samples_split=2, random_state=None,
splitter=best)
[ 0.34436094 0.65563906]
[[ 1.8 90. ]
[ 1.7 80. ]
[ 1.9 70. ]
[ 1.6 60. ]
[ 1.6 40. ]
[ 1.5 50. ]
[ 1.8 60. ]
[ 1.9 80. ]]
[ 1. 1. 0. 1. 0. 0. 0. 1.]
[ 1. 1. 0. 1. 0. 0. 0. 1.]
1.0
precision recall f1-score support
thin 1.00 1.00 1.00 5
fat 1.00 1.00 1.00 5
avg / total 1.00 1.00 1.00 10
[ 1.]