Classification (data with both continuous and categorical variables)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report,confusion_matrix
import warnings
warnings.filterwarnings('ignore')
'''
Attribute Information
age: continuous              workclass: categorical
fnlwgt: continuous           education: categorical
education-num: continuous    marital-status: categorical
occupation: categorical      relationship: categorical
race: categorical            sex: categorical
capital-gain: continuous     capital-loss: continuous
hours-per-week: continuous   native-country: categorical
Label Information
>50K | <=50K
'''
dataset = []
count_label_1,count_label_2 = 0,0
num = 5000
with open('adult.txt','r') as f:
    for line in f.readlines():
        # Skip rows containing missing values (marked with '?')
        if '?' in line:
            continue
        attrs = line[0:-1].split(', ')
        # Keep at most `num` samples per class to balance the two labels
        if attrs[-1] == '<=50K' and count_label_1 < num:
            dataset.append(attrs)
            count_label_1 += 1
        if attrs[-1] == '>50K' and count_label_2 < num:
            dataset.append(attrs)
            count_label_2 += 1
        if count_label_1 >= num and count_label_2 >= num:
            break
dataset = np.array(dataset)
print('Dataset: rows = ', dataset.shape[0], ' cols = ', dataset.shape[1])
# Encode each column: numeric columns are kept as integers, categorical columns are label-encoded
for index, value in enumerate(dataset[0]):
    if value.isdigit():
        dataset[:, index] = dataset[:, index].astype(int)
    else:
        encoder = preprocessing.LabelEncoder()
        dataset[:, index] = encoder.fit_transform(dataset[:, index])
dataset = dataset.astype(int)
X, y = dataset[:, 0:-1], dataset[:, -1]
# Standardize all (now numeric) features
x = preprocessing.StandardScaler().fit_transform(X)
# Feature selection: rank features with an ExtraTrees model
clf = ExtraTreesClassifier(n_estimators=500,criterion='gini',max_depth=4,random_state=1)
clf.fit(x,y)
# Sort feature indices by importance, highest first
index = np.flipud(np.argsort(clf.feature_importances_))
score = clf.feature_importances_[index]
fig,ax = plt.subplots(figsize=(15,8))
plt.bar(range(len(index)),score,align='center')
plt.xticks(range(len(index)),index)
plt.title('Importance of features')
plt.show()
# Keep only the 10 most important features (see the alternative sketch below)
nx = x[:, index[0:10]]
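# The manual top-10 selection above could equivalently be done with
# sklearn.feature_selection.SelectFromModel; this is an optional sketch, not part
# of the original script (the variable name nx_alt is illustrative only).
from sklearn.feature_selection import SelectFromModel
selector = SelectFromModel(clf, threshold=-np.inf, max_features=10, prefit=True)
nx_alt = selector.transform(x)  # same 10 columns as nx, in their original column order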
x_train,x_test,y_train,y_test = train_test_split(nx,y,test_size=0.2,random_state=1)
# Cross-validation: compare several classifiers with 10-fold CV on the training set
print('----------CROSS VALIDATION----------')
models = {'SVC':SVC(),'LogisticRegression':LogisticRegression(),'GBT':GradientBoostingClassifier()}
for model in models:
    scores = cross_val_score(models[model], x_train, y_train, scoring='accuracy', cv=10)
    print('{} acc = {}'.format(model, round(scores.mean(), 2)))
print('----------GRID SEARCH----------')
params = {
    'C': [0.001, 0.01, 0.1, 1.0, 10, 100, 1000],
    'kernel': ['linear', 'rbf', 'sigmoid', 'poly']
}
clf = GridSearchCV(SVC(random_state=1),params)
clf.fit(x_train,y_train)
print('Best parameters(SVC):\n',clf.best_estimator_)
params = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1.0, 10, 100, 1000]
}
# liblinear supports both l1 and l2 penalties (the default lbfgs solver only supports l2)
clf = GridSearchCV(LogisticRegression(solver='liblinear', random_state=1), params)
clf.fit(x_train,y_train)
print('Best parameters(LogisticRegression):\n',clf.best_estimator_)
params = {
    'learning_rate': [0.001, 0.002, 0.01, 0.02, 0.1, 0.2],
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [4, 8, 12, 16, 20]
}
clf = GridSearchCV(GradientBoostingClassifier(random_state=1),params)
clf.fit(x_train,y_train)
print('Best parameters(GBT):\n',clf.best_estimator_)
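# classification_report and confusion_matrix are imported above but never used;
# a minimal evaluation sketch on the held-out test set could look like this
# (it reuses the last fitted grid search, i.e. the tuned GradientBoostingClassifier).
print('----------TEST SET EVALUATION----------')
y_pred = clf.best_estimator_.predict(x_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))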