- 1、样本随机:从 N 个样本中有放回地随机抽取一个样本,重复 N 次(即 Bootstrap 采样),得到每棵树的训练集,因此同一个样本有可能被重复抽到
- 2、特征随机:每次节点分裂时,先从 M 个特征中随机选出 m 个特征(m < M),再在这 m 个特征中按信息增益或基尼系数等准则选择最优划分特征;普通的决策树则是在全部 M 个特征中选择
- 1、能够有效的运行在大数据集上,精度高
- 2、能够处理高维数据而不需要降维处理
- 3、采用随机采样,训练出的模型的方差小,泛化能力强
- 4、能够评估各个特征在分类问题上的重要性
- 5、对缺失值不敏感
import pandas as pd
# Load the training table from the CSV file in the working directory.
train_csv_path = "./train_modified.csv"
df = pd.read_csv(train_csv_path)
Disbursed | Existing_EMI | ID | Loan_Amount_Applied | Loan_Tenure_Applied | Monthly_Income | Var4 | Var5 | Age | EMI_Loan_Submitted_Missing | ... | Var2_2 | Var2_3 | Var2_4 | Var2_5 | Var2_6 | Mobile_Verified_0 | Mobile_Verified_1 | Source_0 | Source_1 | Source_2 | |
0 | 0 | 0.0 | ID000002C20 | 300000 | 5 | 20000 | 1 | 0 | 37 | 1 | ... | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 |
1 | 0 | 0.0 | ID000004E40 | 200000 | 2 | 35000 | 3 | 13 | 30 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 |
2 | 0 | 0.0 | ID000007H20 | 600000 | 4 | 22500 | 1 | 0 | 34 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
3 | 0 | 0.0 | ID000008I30 | 1000000 | 5 | 35000 | 3 | 10 | 28 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
4 | 0 | 25000.0 | ID000009J40 | 500000 | 2 | 100000 | 3 | 17 | 31 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
5 rows × 51 columns
这里我们以 Disbursed 这一列作为分类标签。该列只有 0 和 1 两种取值,因此是二分类问题;观察样本可以发现两类样本的数量极不均衡,此时准确率无法反映分类器的好坏,需要使用 AUC 作为评估指标
0 19680
1 320
Name: Disbursed, dtype: int64
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
%matplotlib inline
# Use every column except the target ("Disbursed") and the row identifier
# ("ID") as model features.
x_columns = [x for x in df.columns if x not in ["Disbursed", "ID"]]
X = df[x_columns]
y = df["Disbursed"]
# Split the data set.  The positive class is rare, so stratify on y to keep
# the class ratio the same in the train and test partitions, and fix the seed
# so that the scores printed further down can be reproduced from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)
# Visualise two of the features, drawing one scatter series per class.
fig = plt.figure()
for cls in (0, 1):
    cls_mask = y_train == cls
    plt.scatter(
        x_train.loc[cls_mask, "Loan_Tenure_Applied"],
        x_train.loc[cls_mask, "Var4"],
    )
plt.legend([0, 1])
使用单个 CART 决策树模型,不调任何参数,观察结果:尽管分类的准确率(accuracy)很高,但是在样本类别极不均衡的情况下,AUC 得分接近 0.5,说明这个分类器的区分能力和随机猜测相差无几,性能很差
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, classification_report
# Baseline: a single CART decision tree with default hyper-parameters.
# The seed is fixed so that the reported scores are reproducible.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(x_train, y_train)
accuracy = dtc.score(x_test, y_test)
print("Accuracy (test): \n", accuracy)
y_pred = dtc.predict(x_test)
# classification_report produces per-class precision / recall / F1, not a
# confusion matrix, so the printed heading says "classification report".
print("分类报告:\n", classification_report(y_test, y_pred))
# AUC is computed from the predicted probability of class 1 (second column).
y_predprob = dtc.predict_proba(x_test)[:, 1]
print("AUC Score (test): %f" % roc_auc_score(y_test, y_predprob))
Accuracy (test):
precision recall f1-score support
0 0.99 0.98 0.98 4926
1 0.05 0.07 0.06 74
avg / total 0.97 0.97 0.97 5000
AUC Score (test): 0.523336
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, classification_report
# Baseline: gradient-boosted trees with default hyper-parameters.
# The seed is fixed so that the reported scores are reproducible.
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(x_train, y_train)
accuracy = gbc.score(x_test, y_test)
print("Accuracy (test): \n", accuracy)
y_pred = gbc.predict(x_test)
# classification_report produces per-class precision / recall / F1, not a
# confusion matrix, so the printed heading says "classification report".
print("分类报告:\n", classification_report(y_test, y_pred))
# AUC is computed from the predicted probability of class 1 (second column).
y_predprob = gbc.predict_proba(x_test)[:, 1]
print("AUC Score (test): %f" % roc_auc_score(y_test, y_predprob))
Accuracy (test):
precision recall f1-score support
0 0.99 1.00 0.99 4926
1 0.00 0.00 0.00 74
avg / total 0.97 0.98 0.98 5000
AUC Score (test): 0.824064
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report
# Baseline: a random forest with default hyper-parameters.  oob_score=True
# additionally scores the forest on the samples each tree did not see during
# bootstrap sampling.  The seed is fixed so the scores are reproducible.
rfc = RandomForestClassifier(oob_score=True, random_state=42)
rfc.fit(x_train, y_train)
accuracy = rfc.score(x_test, y_test)
print("Accuracy (test): \n", accuracy)
y_pred = rfc.predict(x_test)
# classification_report produces per-class precision / recall / F1, not a
# confusion matrix, so the printed heading says "classification report".
print("分类报告(test):\n", classification_report(y_test, y_pred))
# AUC is computed from the predicted probability of class 1 (second column).
y_predprob = rfc.predict_proba(x_test)[:, 1]
print("AUC Score (test): %f" % roc_auc_score(y_test, y_predprob))
print("袋外分数:\n", rfc.oob_score_)
Accuracy (test):
precision recall f1-score support
0 0.99 1.00 0.99 4926
1 0.50 0.01 0.03 74
avg / total 0.98 0.99 0.98 5000
AUC Score (test): 0.603095
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import GridSearchCV
# Tune the random forest with an exhaustive grid search under 5-fold
# cross-validation.  The seed is fixed so the search result is reproducible.
rfc = RandomForestClassifier(oob_score=True, max_features="sqrt", random_state=42)
params = {
    "max_depth": list(range(3, 15, 2)),
    "n_estimators": list(range(50, 201, 20)),
    "min_samples_split": list(range(80, 150, 20)),
    "min_samples_leaf": list(range(10, 60, 10)),
}
# Rank candidates by ROC AUC instead of the default accuracy: with so few
# positive samples, accuracy is almost the same for every candidate and cannot
# tell a useful model from one that always predicts the majority class.
# n_jobs=-1 spreads the 960 candidates x 5 folds over all available cores.
gs = GridSearchCV(
    estimator=rfc, param_grid=params, scoring="roc_auc", cv=5, n_jobs=-1
)
gs.fit(x_train, y_train)
# gs.score() reports the search metric (AUC here), so take the accuracy from
# the refitted best estimator instead.
accuracy = gs.best_estimator_.score(x_test, y_test)
print("Accuracy (test): \n", accuracy)
y_pred = gs.predict(x_test)
# classification_report produces per-class precision / recall / F1, not a
# confusion matrix, so the printed heading says "classification report".
print("分类报告(test):\n", classification_report(y_test, y_pred))
# AUC is computed from the predicted probability of class 1 (second column).
y_predprob = gs.predict_proba(x_test)[:, 1]
print("AUC Score (test): %f" % roc_auc_score(y_test, y_predprob))
Accuracy (test):
precision recall f1-score support
0 0.99 1.00 0.99 4926
1 0.00 0.00 0.00 74
avg / total 0.97 0.99 0.98 5000
AUC Score (test): 0.803139
# Show the winning estimator, its mean cross-validated score and the
# hyper-parameter combination that produced it.
(
    gs.best_estimator_,
    gs.best_score_,
    gs.best_params_,
)
(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=3, max_features='sqrt', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=10, min_samples_split=80,
min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
oob_score=True, random_state=None, verbose=0, warm_start=False),
{'max_depth': 3,
'min_samples_leaf': 10,
'min_samples_split': 80,
'n_estimators': 50})
# Refit a forest with the hyper-parameters reported by the grid search above.
# Only the non-default settings are spelled out.  The original call was pasted
# from an estimator repr and passed `min_impurity_split`, which newer
# scikit-learn releases no longer accept (TypeError at construction time);
# the remaining omitted arguments were all library defaults.
# The seed is fixed so that the reported scores are reproducible.
rfc2 = RandomForestClassifier(
    n_estimators=50,
    max_depth=3,
    max_features="sqrt",
    min_samples_split=80,
    min_samples_leaf=10,
    oob_score=True,
    random_state=42,
)
rfc2.fit(x_train, y_train)
accuracy = rfc2.score(x_test, y_test)
print("Accuracy (test): \n", accuracy)
y_pred = rfc2.predict(x_test)
# classification_report produces per-class precision / recall / F1, not a
# confusion matrix, so the printed heading says "classification report".
print("分类报告(test):\n", classification_report(y_test, y_pred))
# AUC is computed from the predicted probability of class 1 (second column).
y_predprob = rfc2.predict_proba(x_test)[:, 1]
print("AUC Score (test): %f" % roc_auc_score(y_test, y_predprob))
print("袋外分数:\n", rfc2.oob_score_)
Accuracy (test):
precision recall f1-score support
0 0.98 1.00 0.99 4918
1 0.00 0.00 0.00 82
avg / total 0.97 0.98 0.98 5000
AUC Score (test): 0.808931
