# Python (Classification)
#
# Date: 2024-03-15 13:46:31
#
# Classification on data containing both continuous and categorical variables

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report,confusion_matrix

import warnings
warnings.filterwarnings('ignore')
'''
Attribute Information
age: continuous             workclass: categorical
fnlwgt: continuous          education: categorical
education-num: continuous   marital-status: categorical
occupation: categorical     relationship: categorical
race: categorical           sex: categorical
capital-gain: continuous    capital-loss: continuous
hours-per-week: continuous  native-country: categorical

Label Information
>50K | <=50K
'''
# Load a class-balanced sample of the Adult census data: up to `num` rows
# per label, skipping any row with a missing value (marked '?').
dataset = []
count_label_1, count_label_2 = 0, 0
num = 5000
with open('adult.txt', 'r') as f:
    for line in f:  # stream line-by-line instead of materializing readlines()
        if '?' in line:
            continue
        # rstrip('\n') is safe even if the last line has no trailing newline,
        # unlike line[0:-1], which would chop a real data character there.
        attrs = line.rstrip('\n').split(', ')
        # The two labels are mutually exclusive, so elif avoids a second test.
        if attrs[-1] == '<=50K' and count_label_1 < num:
            dataset.append(attrs)
            count_label_1 += 1
        elif attrs[-1] == '>50K' and count_label_2 < num:
            dataset.append(attrs)
            count_label_2 += 1
        # Stop reading once both classes have reached their quota.
        if count_label_1 >= num and count_label_2 >= num:
            break
dataset = np.array(dataset)
print('数据集:rows = ',dataset.shape[0],' cols = ',dataset.shape[1])

# Encode every column to integers: columns whose first-row value is all
# digits are parsed as numbers, the rest are label-encoded categoricals.
# The original wrote encoded values back into the string array (so its
# per-column astype(int) was silently coerced back to str, and a final
# whole-array astype was needed); building an int array up front avoids
# that round-trip and produces the same values.
encoded = np.empty(dataset.shape, dtype=int)
for col, sample in enumerate(dataset[0]):
    if sample.isdigit():
        encoded[:, col] = dataset[:, col].astype(int)
    else:
        # LabelEncoder assigns codes in sorted (alphabetical) order,
        # matching the original behavior.
        encoded[:, col] = preprocessing.LabelEncoder().fit_transform(dataset[:, col])
dataset = encoded

X, y = dataset[:, 0:-1], dataset[:, -1]

# Standardize features to zero mean / unit variance (important for SVC/LR).
x = preprocessing.StandardScaler().fit_transform(X)

# Feature selection: rank features by extra-trees importance, plot the
# ranking, then keep the 10 most important features for model training.
selector = ExtraTreesClassifier(n_estimators=500, criterion='gini',
                                max_depth=4, random_state=1)
selector.fit(x, y)
importances = selector.feature_importances_
ranking = np.argsort(importances)[::-1]  # feature indices, most important first
positions = range(len(ranking))
fig, ax = plt.subplots(figsize=(15, 8))
ax.bar(positions, importances[ranking], align='center')
ax.set_xticks(positions)
ax.set_xticklabels(ranking)
ax.set_title('importances of features')
plt.show()

# Keep the top-10 features; hold out 20% of the rows as the test set.
top_features = x[:, ranking[0:10]]
x_train, x_test, y_train, y_test = train_test_split(top_features, y,
                                                    test_size=0.2,
                                                    random_state=1)

# Cross-validation: baseline 10-fold CV accuracy for three candidate
# classifiers with default hyperparameters.
print('----------CROSS VALIDATION----------')
models = {'SVC': SVC(),
          'LogisticRegression': LogisticRegression(),
          'GBT': GradientBoostingClassifier()}
for name, estimator in models.items():
    fold_scores = cross_val_score(estimator, x_train, y_train,
                                  scoring='accuracy', cv=10)
    print('{} acc = {}'.format(name, round(fold_scores.mean(), 2)))

print('----------GRID SEARCH----------')
# Exhaustively tune the SVC regularization strength and kernel.
svc_grid = {
    'C': [0.001, 0.01, 0.1, 1.0, 10, 100, 1000],
    'kernel': ['linear', 'rbf', 'sigmoid', 'poly'],
}
clf = GridSearchCV(SVC(random_state=1), svc_grid)
clf.fit(x_train, y_train)
print('Best parameters(SVC):\n', clf.best_estimator_)

# Tune LogisticRegression over penalty type and regularization strength.
# BUG FIX: the default lbfgs solver does not support penalty='l1', so every
# l1 candidate in the grid would fail to fit (scored as NaN under the default
# error_score, with the warning suppressed by filterwarnings above).
# liblinear supports both l1 and l2, making the whole grid valid.
params = {
        'penalty':['l1','l2'],
        'C':[0.001,0.01,0.1,1.0,10,100,1000]
    }
clf = GridSearchCV(LogisticRegression(solver='liblinear', random_state=1), params)
clf.fit(x_train,y_train)
print('Best parameters(LogisticRegression):\n',clf.best_estimator_)

# Tune the gradient-boosted trees. NOTE: this grid has 6*5*5 = 150
# candidates, each refit per CV fold — expect a long runtime.
gbt_grid = {
    'learning_rate': [0.001, 0.002, 0.01, 0.02, 0.1, 0.2],
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [4, 8, 12, 16, 20],
}
clf = GridSearchCV(GradientBoostingClassifier(random_state=1), gbt_grid)
clf.fit(x_train, y_train)
print('Best parameters(GBT):\n', clf.best_estimator_)

# End of Python (Classification)