from sklearn import tree

# Fit a decision tree with default hyperparameters on the training split.
# features_train / labels_train / features_test / labels_test are not defined
# in this snippet; they must already exist when it runs.
clf = tree.DecisionTreeClassifier()
clf.fit(features_train, labels_train)

# Keep the raw test-set predictions, then record the score reported by the
# classifier on the same test split.
pred = clf.predict(features_test)
accuracy = clf.score(features_test, labels_test)
min_samples_split: the minimum number of samples required to split an internal node (scikit-learn default: 2).
当 min_samples_split 设为 50 时，样本数少于 50 的节点不再继续划分，因此可以在一定程度上减少过拟合。
##决策树编码
def classify(features_train, labels_train):
    """Train a decision tree classifier and return the fitted estimator.

    Args:
        features_train: Training feature vectors, passed straight to ``fit``.
        labels_train: Training labels, passed straight to ``fit``.

    Returns:
        The ``DecisionTreeClassifier`` (default hyperparameters) after fitting.
    """
    from sklearn.tree import DecisionTreeClassifier

    # ``fit`` returns the estimator itself, so its result can be returned directly.
    return DecisionTreeClassifier().fit(features_train, labels_train)
##决策树准确性
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): this section is titled "decision tree accuracy" but the
# estimator used here is a random forest -- confirm which one was intended.
clf = RandomForestClassifier()
clf.fit(features_train, labels_train)

# Predictions on the test split, followed by the classifier's score on it.
pred = clf.predict(features_test)
acc = clf.score(features_test, labels_test)
##决策树准确性:min_samples_split=2 与 50 对比
from sklearn.ensemble import RandomForestClassifier

# Train the same kind of model twice, changing only min_samples_split, and
# record the test-set score of each run for comparison.

# Run 1: min_samples_split=2
clf = RandomForestClassifier(min_samples_split=2)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
acc_min_samples_split_2 = clf.score(features_test, labels_test)

# Run 2: min_samples_split=50
# clf and pred are rebound here, so afterwards they refer to this second model.
clf = RandomForestClassifier(min_samples_split=50)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
acc_min_samples_split_50 = clf.score(features_test, labels_test)
##熵公式
##信息增益
##第一个邮件 DT:准确率
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): the heading refers to a decision tree ("DT") but a random
# forest is trained here -- confirm which estimator was intended.
clf = RandomForestClassifier(min_samples_split=40)
clf.fit(features_train, labels_train)

pred = clf.predict(features_test)

# Score on the test split, printed so it can be read off when the script runs.
accuracy = clf.score(features_test, labels_test)
print(accuracy)
##通过特征选择加速
# Print the length of the first training sample, i.e. how many features each
# sample carries (assumes every row of features_train has the same length --
# TODO confirm).
print(len(features_train[0]))
##更改特征数量
# In email_preprocess.py:
#     selector = SelectPercentile(f_classif, percentile=1)  # percentile=1 keeps only 1% of the available features
# In dt_author_id.py:
#     print(len(features_train[0]))