机器学习4----随机森林

时间:2024-02-16 08:28:54

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
data,target=load_iris(return_X_y=True)
data.shape
data
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(data,target,test_size=0.2)
from sklearn.ensemble import RandomForestClassifier  
# n_estimators=100: 决策树的数量,默认100
# max_samples=None, : 最大样本数
#    - If None (default), then draw `X.shape[0]` samples.
#     - If int, then draw `max_samples` samples.
#     - If float, then draw `max_samples * X.shape[0]` samples. Thus,
#       `max_samples` should be in the interval `(0.0, 1.0]`.
# max_features='sqrt', : 最大特征数
#  - If int, then consider `max_features` features at each split.
#     - If float, then `max_features` is a fraction and
#       `max(1, int(max_features * n_features_in_))` features are considered at each
#       split.
#     - If "auto", then `max_features=sqrt(n_features)`.
#     - If "sqrt", then `max_features=sqrt(n_features)`.
#     - If "log2", then `max_features=log2(n_features)`.
#     - If None, then `max_features=n_features`.
# bootstrap=True, 有放回抽样,自助采样法
rfc=RandomForestClassifier(n_estimators=10,max_features=3,max_samples=100)
rfc.fit(x_train,y_train)
#### 特征重要性
rfc.feature_importances_
# 所有的子决策树
rfc.estimators_
# 遍历每一个子决策树
for m in rfc.estimators_:
    y_=m.predict(x_test)
    print(y_[:10])
rfc.predict(x_test)[:10]
rfc.score(x_train,y_train)
rfc.score(x_test,y_test)