Kaggle实战——泰坦尼克生存预测大赛

时间:2022-09-19 15:48:21

In [6]:
import csv
import numpy as np
csv_file_object = csv.reader(open('D:/In/kaggle/Titanic/train.csv', 'rt'))

data=[]
for row in csv_file_object:
data.append(row)
#data = np.array(data)
print (data[0])
print (np.array(data)[0])
['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
'Ticket' 'Fare' 'Cabin' 'Embarked']
In [ ]:
'''
#数据处理
import numpy as np
import pandas as pd
#绘图
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
#各种模型、数据处理方法
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import precision_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
import warnings
warnings.filterwarnings('ignore')
'''
In [2]:
print (data[:3])    #list是一维的,array是二维的
print (np.array(data)[:3])
print (np.array(data)[:15,5])
#print (data[0:15,5])
data=np.array(data)
type(data) #data此时为一个二维的数组
[['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], ['1', '0', '3', 'Braund, Mr. Owen Harris', 'male', '22', '1', '0', 'A/5 21171', '7.25', '', 'S'], ['2', '1', '1', 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)', 'female', '38', '1', '0', 'PC 17599', '71.2833', 'C85', 'C']]
[['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
'Ticket' 'Fare' 'Cabin' 'Embarked']
['1' '0' '3' 'Braund, Mr. Owen Harris' 'male' '22' '1' '0' 'A/5 21171'
'7.25' '' 'S']
['2' '1' '1' 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)'
'female' '38' '1' '0' 'PC 17599' '71.2833' 'C85' 'C']]
['Age' '22' '38' '26' '35' '35' '' '54' '2' '27' '14' '4' '58' '20' '39']
Out[2]:
numpy.ndarray
In [3]:
print(data[1:6,5])
print(data[1:6,5].astype(int))
print(data[1:6,5].astype(int).mean())
['22' '38' '26' '35' '35']
[22 38 26 35 35]
31.2
In [1]:
import pandas as pd
%matplotlib inline
df=pd.read_csv('D:/In/kaggle/Titanic/train.csv')
df_test=pd.read_csv('D:/In/kaggle/Titanic/test.csv')
print(df.info())
print(df[['Age','Sex','Pclass']][:10])
print(df[df['Age']>60][['Survived','Pclass','Sex','Age']])
df[df['Age'].isnull()][:10] #只显示年龄为空的数据
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None
Age Sex Pclass
0 22.0 male 3
1 38.0 female 1
2 26.0 female 3
3 35.0 female 1
4 35.0 male 3
5 NaN male 3
6 54.0 male 1
7 2.0 male 3
8 27.0 female 3
9 14.0 female 2
Survived Pclass Sex Age
33 0 2 male 66.0
54 0 1 male 65.0
96 0 1 male 71.0
116 0 3 male 70.5
170 0 1 male 61.0
252 0 1 male 62.0
275 1 1 female 63.0
280 0 3 male 65.0
326 0 3 male 61.0
438 0 1 male 64.0
456 0 1 male 65.0
483 1 3 female 63.0
493 0 1 male 71.0
545 0 1 male 64.0
555 0 1 male 62.0
570 1 2 male 62.0
625 0 1 male 61.0
630 1 1 male 80.0
672 0 2 male 70.0
745 0 1 male 70.0
829 1 1 female 62.0
851 0 3 male 74.0
Out[1]:
  PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
5 6 0 3 Moran, Mr. James male NaN 0 0 330877 8.4583 NaN Q
17 18 1 2 Williams, Mr. Charles Eugene male NaN 0 0 244373 13.0000 NaN S
19 20 1 3 Masselmani, Mrs. Fatima female NaN 0 0 2649 7.2250 NaN C
26 27 0 3 Emir, Mr. Farred Chehab male NaN 0 0 2631 7.2250 NaN C
28 29 1 3 O'Dwyer, Miss. Ellen "Nellie" female NaN 0 0 330959 7.8792 NaN Q
29 30 0 3 Todoroff, Mr. Lalio male NaN 0 0 349216 7.8958 NaN S
31 32 1 1 Spencer, Mrs. William Augustus (Marie Eugenie) female NaN 1 0 PC 17569 146.5208 B78 C
32 33 1 3 Glynn, Miss. Mary Agatha female NaN 0 0 335677 7.7500 NaN Q
36 37 1 3 Mamee, Mr. Hanna male NaN 0 0 2677 7.2292 NaN C
42 43 0 3 Kraeff, Mr. Theodor male NaN 0 0 349253 7.8958 NaN C
In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(context="paper", font="monospace")
sns.set(style="white")
f, ax = plt.subplots(figsize=(10,6))
train_corr = df.drop('PassengerId',axis=1).corr()
sns.heatmap(train_corr, ax=ax, vmax=.9, square=True)
ax.set_xticklabels(train_corr.index, size=15)
ax.set_yticklabels(train_corr.columns[::1], size=15)
ax.set_title('train feature corr', fontsize=20)
Out[2]:
<matplotlib.text.Text at 0x18f56089128>
Kaggle实战——泰坦尼克生存预测大赛Kaggle实战——泰坦尼克生存预测大赛
In [5]:
for i in range(1,4):
print (i, len (df[ (df['Sex']=='male')&(df['Pclass'] == i) ]) ) #输出不同等级仓中男士的数量
1 122
2 108
3 347
In [6]:
import pylab as p
df['Age'].dropna().hist(range=(0,100),bins=19,alpha=0.8),p.show() #bins代表直方柱的个数 alpha控制颜色深浅
  Kaggle实战——泰坦尼克生存预测大赛
Out[6]:
(<matplotlib.axes._subplots.AxesSubplot at 0x1d7d360a550>, None)
In [7]:
from scipy import stats
fig,axes=plt.subplots(2,1,figsize=(8,6))
sns.set_style('darkgrid')#设置风格主题
sns.distplot(df.Age.fillna(-20),rug=True,color='b',ax=axes[0])#rug强度(齿)
ax0=axes[0]
ax0.set_xlabel('')

ax1=axes[1]
ax1.set_title('age survived distribution')
k1=sns.distplot(df[df.Survived==0].Age.fillna(-20),hist=False,color='r',ax=ax1,label='dead')#罹难的年龄分布
k2=sns.distplot(df[df.Survived==1].Age.fillna(-20),hist=False,color='g',ax=ax1,label='alive')#存活的年龄分布
ax1.set_xlabel('')#x坐标轴名字

ax1.legend(fontsize=16)#小朋友和中青年比较容易存活
Out[7]:
<matplotlib.legend.Legend at 0x227fa3b1320>
  Kaggle实战——泰坦尼克生存预测大赛
In [8]:
f,ax=plt.subplots(figsize=(8,3))
ax.set_title('Sex Age dist',size=20)
sns.distplot(df[df.Sex=='female'].dropna().Age,hist=False,color='pink',label='female')
sns.distplot(df[df.Sex=='male'].dropna().Age,hist=False,color='blue',label='male')
ax.legend(fontsize=15)#训练集中的男女年龄分布 男性中老年较多 女性较年轻
Out[8]:
<matplotlib.legend.Legend at 0x227fa2ded68>
  Kaggle实战——泰坦尼克生存预测大赛
In [16]:
f,ax=plt.subplots(figsize=(8,3))
plt.ylim(0.0,0.03)
ax.set_title('Pclass Age dist',size=20)
sns.distplot(df[df.Pclass==1].dropna().Age,hist=False,color='pink',label='P1')
sns.distplot(df[df.Pclass==2].dropna().Age,hist=False,color='blue',label='P2')
sns.distplot(df[df.Pclass==3].dropna().Age,hist=False,color='green',label='P3')
ax.legend(fontsize=15)#不同仓级年龄分布
Out[16]:
<matplotlib.legend.Legend at 0x227fba1cac8>
  Kaggle实战——泰坦尼克生存预测大赛
In [35]:
y_dead=df[df.Survived==0].groupby('Pclass')['Survived'].count()
y_alive=df[df.Survived==1].groupby('Pclass')['Survived'].count()
pos=[1,2,3]#横轴id
ax=plt.figure(figsize=(8,4)).add_subplot(111)
ax.bar(pos,y_dead,color='r',alpha=0.6,label='dead')
ax.bar(pos,y_alive,color='g',bottom=y_dead,alpha=0.6,label='alive')
ax.legend(fontsize=16,loc='best')
ax.set_xticks(pos)
ax.set_xticklabels(['Pclass%d'%(i) for i in range(1,4)],size=15)#x坐标轴信息
ax.set_title('Pclass Survived count',size=20)#不同仓级存活情况
Out[35]:
<matplotlib.text.Text at 0x227fd256400>
  Kaggle实战——泰坦尼克生存预测大赛
In [29]:
pos=range(0,6)
age_list=[]
for Pclass_ in range(1,4):
for Survived_ in range(0,2):
age_list.append(df[(df.Pclass==Pclass_)&(df.Survived==Survived_)].Age.values)
#三个仓级的存亡年龄
fig,axes=plt.subplots(3,1,figsize=(10,6))
sns.set_style('darkgrid')#设置风格主题
#plt.ylim(0.0,0.06)
#print(axes)
print(len(age_list))
i_Pclass=1
for ax in axes:
if i_Pclass==1:
ax.set_ylim(0.0, 0.03)#设置y轴范围
sns.distplot(age_list[i_Pclass*2-2],hist=False,ax=ax,label='Pclass:%d,survived:0'%(i_Pclass),color='r')
sns.distplot(age_list[i_Pclass*2-1],hist=False,ax=ax,label='Pclass:%d,survived:1'%(i_Pclass),color='g')
i_Pclass+=1
ax.set_xlabel('age',size=15)
ax.legend(fontsize=15)
6
D:\Anaconda3\lib\site-packages\statsmodels\nonparametric\kde.py:454: RuntimeWarning: invalid value encountered in greater
X = X[np.logical_and(X>clip[0], X<clip[1])] # won't work for two columns.
D:\Anaconda3\lib\site-packages\statsmodels\nonparametric\kde.py:454: RuntimeWarning: invalid value encountered in less
X = X[np.logical_and(X>clip[0], X<clip[1])] # won't work for two columns.
Kaggle实战——泰坦尼克生存预测大赛
In [33]:
#性别
print(df.Sex.value_counts())
print('******************************')
print(df.groupby('Sex')['Survived'].mean())#男女存活率
male      577
female 314
Name: Sex, dtype: int64
******************************
Sex
female 0.742038
male 0.188908
Name: Survived, dtype: float64
In [36]:
ax=plt.figure(figsize=(10,4)).add_subplot(111)
sns.violinplot(x='Sex',y='Age',hue='Survived',data=df.dropna(),split=True)#小提琴图
ax.set_xlabel('Sex',size=20)
ax.set_xticklabels(['Female','male'],size=18)
ax.set_ylabel('Age',size=20)
ax.legend(fontsize=25,loc='best')#男女存亡年龄分布
Out[36]:
<matplotlib.legend.Legend at 0x18f568b1f60>
  Kaggle实战——泰坦尼克生存预测大赛
In [42]:
label=[]
for sex_i in ['female','male']:
for pclass_i in range(1,4):
label.append('sex:%s,Pclass:%d'%(sex_i,pclass_i))

pos=range(6)
fig=plt.figure(figsize=(16,4))
ax=fig.add_subplot(111)
ax.bar(pos,df[df['Survived']==0].groupby(['Sex','Pclass'])['Survived'].count().values,
color='r',
alpha=0.5,
align='center',
tick_label=label,
label='dead')
ax.bar(pos,
df[df['Survived']==1].groupby(['Sex','Pclass'])['Survived'].count().values,
bottom=df[df['Survived']==0].groupby(['Sex','Pclass'])['Survived'].count().values,
color='g',
alpha=0.5,
align='center',
tick_label=label,
label='alive')
ax.tick_params(labelsize=15)
ax.set_title('sex_pclass_survived',size=30)
ax.legend(fontsize=15,loc='best')#相同性别情况下,仓级越高越容易存活
Out[42]:
<matplotlib.legend.Legend at 0x18f56eb40f0>
  Kaggle实战——泰坦尼克生存预测大赛
In [69]:
#Fare费用
fig=plt.figure(figsize=(8,6))
ax=plt.subplot2grid((2,2),(0,0),colspan=2)#角标
#fig,ax=plt.subplots(1,1,figsize=(8,6))
ax.tick_params(labelsize=15)
ax.set_title('Fare dist',size=20)
ax.set_ylabel('dist',size=20)
sns.kdeplot(df.Fare,ax=ax)
sns.distplot(df.Fare,hist=True,ax=ax)
ax.legend(fontsize=15)
pos=range(0,400,50)
ax.set_xticks(pos)
ax.set_xlim([0,200])
ax.set_xlabel('')

#fig,ax1=plt.subplots(1,1,figsize=(8,6))
ax1=plt.subplot2grid((2,2),(1,0),colspan=2)
ax1.set_title('Fare Pclass dist',size=20)
for i in range(1,4):
sns.kdeplot(df[df.Pclass==i].Fare,ax=ax1,label='Pclass %d'%(i))#不同仓级的票价分布
ax1.set_xlim([0,200])
ax1.set_ylim([0,0.15])
ax1.legend(fontsize=15)#船票价分布
plt.tight_layout()#间距松紧
  Kaggle实战——泰坦尼克生存预测大赛
In [70]:
fig=plt.figure(figsize=(8,3))
ax1=fig.add_subplot(111)
sns.kdeplot(df[df.Survived==0].Fare,ax=ax1,label='dead',color='r')
sns.kdeplot(df[df.Survived==1].Fare,ax=ax1,label='alive',color='g')
#sns.distplot(df[df.Survived==0].Fare,ax=ax1,color='r')
#sns.distplot(df[df.Survived==1].Fare,ax=ax1,color='g')
ax1.set_xlim([0,300])
ax1.legend(fontsize=15)
ax1.set_title('Fare survived',size=20)
ax1.set_xlabel('Fare',size=15)#存亡票价分布
Out[70]:
<matplotlib.text.Text at 0x18f5b446400>
  Kaggle实战——泰坦尼克生存预测大赛
In [73]:
fig=plt.figure(figsize=(8,4))
ax1=fig.add_subplot(211)
sns.countplot(df.SibSp)#计数
ax1.set_title('SibSp',size=20)
ax2=fig.add_subplot(212,sharex=ax1)
sns.countplot(df.Parch)
ax2.set_title('Parch',size=20)#表亲和直亲
#plt.tight_layout()
Out[73]:
<matplotlib.text.Text at 0x18f5ba0a2e8>
  Kaggle实战——泰坦尼克生存预测大赛
In [76]:
fig=plt.figure(figsize=(10,6))
ax1=fig.add_subplot(311)
df.groupby('SibSp')['Survived'].mean().plot(kind='bar',ax=ax1)#存活率
ax1.set_title('Sibsp Survived Rate',size=16)
ax1.set_xlabel('')

ax2=fig.add_subplot(312)
df.groupby('Parch')['Survived'].mean().plot(kind='bar',ax=ax2)
ax2.set_title('Parch Survived Rate',size=16)
ax2.set_xlabel('')

ax3=fig.add_subplot(313)
df.groupby(df.SibSp+df.Parch)['Survived'].mean().plot(kind='bar',ax=ax3)
ax3.set_title('Parch+Sibsp Survived Rate',size=16)
#plt.tight_layout()
Out[76]:
<matplotlib.text.Text at 0x18f5cf92d30>
  Kaggle实战——泰坦尼克生存预测大赛
In [85]:
#上船地点
plt.style.use('ggplot')#美化
ax=plt.figure(figsize=(8,3)).add_subplot(111)
pos=[1,2,3]
y1=df[df.Survived==0].groupby('Embarked')['Survived'].count().sort_index().values#确保存亡的一一对应
print(y1)
y2=df[df.Survived==1].groupby('Embarked')['Survived'].count().sort_index().values
ax.bar(pos,y1,color='r',alpha=0.4,align='center',label='dead')
ax.bar(pos,y2,color='g',alpha=0.4,align='center',label='alive',bottom=y1)
ax.set_xticks(pos)
ax.set_xticklabels(['C','Q','S'])
ax.legend(fontsize=15,loc='best')
ax.set_title('Embarked survived count',size=18)
[ 75  47 427]
Out[85]:
<matplotlib.text.Text at 0x18f5d3b8e10>
  Kaggle实战——泰坦尼克生存预测大赛
In [94]:
#C地存活概率较高
#不同的上船地点
ax=plt.figure(figsize=(8,3)).add_subplot(111)
ax.set_xlim([-20,80])
ax.set_ylim([0.0,0.03])
sns.kdeplot(df[df.Embarked=='C'].Age.fillna(-10),ax=ax,label='C',color='r')
sns.kdeplot(df[df.Embarked=='Q'].Age.fillna(-10),ax=ax,label='Q',color='b')
sns.kdeplot(df[df.Embarked=='S'].Age.fillna(-10),ax=ax,label='S',color='g')
ax.legend(fontsize=18)
ax.set_title('Embarked Age Dist',size=18)
#plt.tight_layout()
#Q上岸的年龄缺失比较多
#C和S上岸的年龄分布较相似,但是C的分布更扁平小孩和老人的占比更高
Out[94]:
<matplotlib.text.Text at 0x18f5da920b8>
  Kaggle实战——泰坦尼克生存预测大赛
In [103]:
#不同仓位不同地点
y1=df[df.Survived==0].groupby(['Embarked','Pclass'])['Survived'].count().reset_index()['Survived'].values
print(y1)
y2=df[df.Survived==1].groupby(['Embarked','Pclass'])['Survived'].count().reset_index()['Survived'].values

ax=plt.figure(figsize=(8,3)).add_subplot(111)
pos=range(9)
ax.bar(pos,y1,align='center',alpha=0.5,color='r',label='dead')
ax.bar(pos,y2,align='center',bottom=y1,alpha=0.5,color='g',label='alive')

ax.set_xticks(pos)
xticklabels=[]
for embarked_val in ['C','Q','S']:
for pclass_val in range(1,4):
xticklabels.append('%s/%d'%(embarked_val,pclass_val))

ax.set_xticklabels(xticklabels,size=15)
ax.legend(fontsize=15,loc='best')#C地的存活率似乎更高
[ 26   8  41   1   1  45  53  88 286]
Out[103]:
<matplotlib.legend.Legend at 0x18f5b53ecf8>
  Kaggle实战——泰坦尼克生存预测大赛
In [123]:
#Cabin船舱号
print(df['Cabin'].isnull().value_counts())
df.groupby(df['Cabin'].isnull())['Survived'].mean()
#船舱号为空的存活率低,可以作为一个特征
True     687
False 204
Name: Cabin, dtype: int64
Out[123]:
Cabin
False 0.666667
True 0.299854
Name: Survived, dtype: float64
In [148]:
print(df[df['PassengerId']==28]['Cabin'])
print(len(df.loc[27,'Cabin']))
df[df.Cabin.apply(lambda x:len(x) if (x is not np.nan) else 0)>4].head(10)#返回Cabin大于4个字符的(有多个船舱的)
27    C23 C25 C27
Name: Cabin, dtype: object
11
Out[148]:
  PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
27 28 0 1 Fortune, Mr. Charles Alexander male 19.00 3 2 19950 263.0000 C23 C25 C27 S
75 76 0 3 Moen, Mr. Sigurd Hansen male 25.00 0 0 348123 7.6500 F G73 S
88 89 1 1 Fortune, Miss. Mabel Helen female 23.00 3 2 19950 263.0000 C23 C25 C27 S
97 98 1 1 Greenfield, Mr. William Bertram male 23.00 0 1 PC 17759 63.3583 D10 D12 C
118 119 0 1 Baxter, Mr. Quigg Edmond male 24.00 0 1 PC 17558 247.5208 B58 B60 C
128 129 1 3 Peter, Miss. Anna female NaN 1 1 2668 22.3583 F E69 C
297 298 0 1 Allison, Miss. Helen Loraine female 2.00 1 2 113781 151.5500 C22 C26 S
299 300 1 1 Baxter, Mrs. James (Helene DeLaudeniere Chaput) female 50.00 0 1 PC 17558 247.5208 B58 B60 C
305 306 1 1 Allison, Master. Hudson Trevor male 0.92 1 2 113781 151.5500 C22 C26 S
311 312 1 1 Ryerson, Miss. Emily Borie female 18.00 2 2 PC 17608 262.3750 B57 B59 B63 B66 C
In [149]:
#不同船舱的存亡统计
df['Cabin_Zone']=df.Cabin.fillna('0').str.split(' ').apply(lambda x: x[0][0])
df.groupby(by='Cabin_Zone')['Survived'].agg(['mean','count'])
#不同船舱的存亡率不一样
Out[149]:
  mean count
Cabin_Zone    
0 0.299854 687
A 0.466667 15
B 0.744681 47
C 0.593220 59
D 0.757576 33
E 0.750000 32
F 0.615385 13
G 0.500000 4
T 0.000000 1
In [155]:
#船票Ticket
print(df.Ticket.head())
print(len(df.Ticket.unique()))#船票有重复的
df[df.Ticket=='110152']
0           A/5 21171
1 PC 17599
2 STON/O2. 3101282
3 113803
4 373450
Name: Ticket, dtype: object
681
Out[155]:
  PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Cabin_Zone
257 258 1 1 Cherry, Miss. Gladys female 30.0 0 0 110152 86.5 B77 S B
504 505 1 1 Maioni, Miss. Roberta female 16.0 0 0 110152 86.5 B79 S B
759 760 1 1 Rothes, the Countess. of (Lucy Noel Martha Dye... female 33.0 0 0 110152 86.5 B77 S B
In [165]:
#船票有重复的
print(df[df.Cabin=='B77'])

#有些船票有英文,有些则没有,使用正则!!!!!!
import re
def find_e_word(x):
pattern=re.compile('[a-z]|[A-Z]')
try:
re.search(pattern,x).group()
return 1
except:
return 0

df['Ticket_e']=df.Ticket.apply(lambda x: find_e_word(x))
df.groupby('Ticket_e')['Survived'].mean()
#存活率没区别
     PassengerId  Survived  Pclass  \
257 258 1 1
759 760 1 1

Name Sex Age SibSp \
257 Cherry, Miss. Gladys female 30.0 0
759 Rothes, the Countess. of (Lucy Noel Martha Dye... female 33.0 0

Parch Ticket Fare Cabin Embarked Cabin_Zone Ticket_e
257 0 110152 86.5 B77 S B 0
759 0 110152 86.5 B77 S B 0
Out[165]:
Ticket_e
0 0.384266
1 0.382609
Name: Survived, dtype: float64
In [174]:
#名字Name
print(df.Name.apply(lambda x: x.split(',')[1].split('.')[0]).value_counts())
df.Name.apply(lambda x: x.split(',')[1].split('.')[1]).value_counts()[:8]
 Mr              517
Miss 182
Mrs 125
Master 40
Dr 7
Rev 6
Major 2
Col 2
Mlle 2
Capt 1
the Countess 1
Don 1
Ms 1
Lady 1
Mme 1
Jonkheer 1
Sir 1
Name: Name, dtype: int64
Out[174]:
 John             9
James 7
Mary 6
William 6
Ivan 4
Bertha 4
William Henry 4
William John 4
Name: Name, dtype: int64
In [ ]:
#--------------------------------
In [183]:
df.head()
Out[183]:
  PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Cabin_Zone Ticket_e
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 0 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C C 1
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 0 1
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S C 0
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S 0 0
In [177]:
#查看数据缺失情况
print(df.isnull().sum())
df_test.isnull().sum()
PassengerId      0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
Cabin_Zone 0
Ticket_e 0
dtype: int64
Out[177]:
PassengerId      0
Pclass 0
Name 0
Sex 0
Age 86
SibSp 0
Parch 0
Ticket 0
Fare 1
Cabin 327
Embarked 0
dtype: int64
In [179]:
df[df['Embarked'].isnull()]
Out[179]:
  PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Cabin_Zone Ticket_e
61 62 1 1 Icard, Miss. Amelie female 38.0 0 0 113572 80.0 B28 NaN B 0
829 830 1 1 Stone, Mrs. George Nelson (Martha Evelyn) female 62.0 0 0 113572 80.0 B28 NaN B 0
In [181]:
print(df['Embarked'].value_counts())
print(df[df['Pclass']==1].Embarked.value_counts())
df.Embarked.fillna('S',inplace=True)
#上船地点填充
S    644
C 168
Q 77
Name: Embarked, dtype: int64
Out[181]:
S    127
C 85
Q 2
Name: Embarked, dtype: int64
In [234]:
#Cabin缺失值的处理,方法一
df['Cabin_e']=df['Cabin'].isnull().map({True:0,False:1})
df_test['Cabin_e']=df_test['Cabin'].isnull().map({True:0,False:1})
#df=df.drop(['Cabin_e'],axis=1)


#df['Cabin_e']=df['Cabin'].isnull().map(lambda x:0 if x is True else 1)方法二!!!!!!
#df_test['Cabin_e']
#df.head()
#df=df.drop(['Cabin_e'],axis=1)

"""方法三
import re
def Cabin_isnull(x):
pattern=re.compile("\d$")
try:
re.search(pattern,x).group()
return 1
except:
return 0
df['Cabin_e']=df['Cabin'].apply(lambda x: Cabin_isnull(x))
df.head()
"""
df_test.head()
Out[234]:
  PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Cabin_e
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q 0
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S 0
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q 0
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S 0
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S 0
In [8]:
df['Gender']=5
df['Gender']=df['Sex'].map(lambda x:x[0].upper())
print(df.head())
df['Gender']=df['Sex'].map({'male':1,'female':0}) #并行化执行
df.head()
   PassengerId  Survived  Pclass  \
0 1 0 3
1 2 1 1
2 3 1 3
3 4 1 1
4 5 0 3

Name Sex Age SibSp \
0 Braund, Mr. Owen Harris male 22.0 1
1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1
2 Heikkinen, Miss. Laina female 26.0 0
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1
4 Allen, Mr. William Henry male 35.0 0

Parch Ticket Fare Cabin Embarked Gender
0 0 A/5 21171 7.2500 NaN S M
1 0 PC 17599 71.2833 C85 C F
2 0 STON/O2. 3101282 7.9250 NaN S F
3 0 113803 53.1000 C123 S F
4 0 373450 8.0500 NaN S M
Out[8]:
  PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Gender
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 0
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 0
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S 1
In [236]:
#Age年龄离散化,以5岁为一个周期
def age_map(x):
if x<10:
return '10-'
if x<60:
return '%d-%d'%(x//5*5,x//5*5+5)
elif x>=60:
return '60+'
else:
return 'Null'
df['Age_map']=df['Age'].apply(lambda x: age_map(x))
df_test['Age_map']=df_test['Age'].apply(lambda x:age_map(x))
df.groupby('Age_map')['Survived'].agg(['count','mean'])#不同年龄层的存亡情况
Out[236]:
  count mean
Age_map    
10- 62 0.612903
10-15 16 0.437500
15-20 86 0.395349
20-25 114 0.342105
25-30 106 0.358491
30-35 95 0.421053
35-40 72 0.458333
40-45 48 0.375000
45-50 41 0.390244
50-55 32 0.437500
55-60 16 0.375000
60+ 26 0.269231
Null 177 0.293785
In [247]:
#test中的Fare缺失
print(df_test[df_test['Fare'].isnull()])
df_test.loc[df_test.Fare.isnull(),'Fare']=df_test[(df_test['Pclass']==3)&(df_test['Embarked']=='S')&(df_test['Sex']=='male')].dropna().Fare.mean()
df_test[df_test['PassengerId']==1044]
Empty DataFrame
Columns: [PassengerId, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked, Cabin_e, Age_map]
Index: []
Out[247]:
  PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Cabin_e Age_map
152 1044 3 Storey, Mr. Thomas male 60.5 0 0 3701 7.65 NaN S 0 60+
In [266]:
#数据归一化以加速模型收敛,Fare分布太宽
import sklearn.preprocessing as preprocessing
scaler=preprocessing.StandardScaler()
fare_scale_param=scaler.fit(df['Fare'].values.reshape(-1,1))

df['Fare_e']=fare_scale_param.transform(df['Fare'].values.reshape(-1,1))
df_test['Fare_e']=fare_scale_param.transform(df_test['Fare'].values.reshape(-1,1))
print(df.head())
df_test.head()
   PassengerId  Survived  Pclass  \
0 1 0 3
1 2 1 1
2 3 1 3
3 4 1 1
4 5 0 3

Name Sex Age SibSp \
0 Braund, Mr. Owen Harris male 22.0 1
1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1
2 Heikkinen, Miss. Laina female 26.0 0
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1
4 Allen, Mr. William Henry male 35.0 0

Parch Ticket Fare Cabin Embarked Cabin_Zone Ticket_e \
0 0 A/5 21171 7.2500 NaN S 0 1
1 0 PC 17599 71.2833 C85 C C 1
2 0 STON/O2. 3101282 7.9250 NaN S 0 1
3 0 113803 53.1000 C123 S C 0
4 0 373450 8.0500 NaN S 0 0

Cabin_e Age_map Fare_e
0 0 20-25 -0.502445
1 1 35-40 0.786845
2 0 25-30 -0.488854
3 1 35-40 0.420730
4 0 35-40 -0.486337
Out[266]:
  PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Cabin_e Age_map Fare_e
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q 0 30-35 -0.490783
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S 0 45-50 -0.507479
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q 0 60+ -0.453367
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S 0 25-30 -0.474005
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S 0 20-25 -0.401017
In [267]:
#部分变量哑编码,onehot独热码
df_x = pd.concat([df[['SibSp','Parch','Fare']], pd.get_dummies(df[['Pclass','Sex','Cabin','Embarked','Age_map']])],axis=1)#按照列黏连
df_y = df.Survived
df_test_x = pd.concat([df_test[['SibSp','Parch','Fare']], pd.get_dummies(df_test[['Pclass', 'Sex','Cabin','Embarked', 'Age_map']])],axis=1)
print(df_x.head())
df_test_x.head()
   SibSp  Parch     Fare  Pclass  Sex_female  Sex_male  Cabin_A10  Cabin_A14  \
0 1 0 7.2500 3 0 1 0 0
1 1 0 71.2833 1 1 0 0 0
2 0 0 7.9250 3 1 0 0 0
3 1 0 53.1000 1 1 0 0 0
4 0 0 8.0500 3 0 1 0 0

Cabin_A16 Cabin_A19 ... Age_map_20-25 Age_map_25-30 \
0 0 0 ... 1 0
1 0 0 ... 0 0
2 0 0 ... 0 1
3 0 0 ... 0 0
4 0 0 ... 0 0

Age_map_30-35 Age_map_35-40 Age_map_40-45 Age_map_45-50 Age_map_50-55 \
0 0 0 0 0 0
1 0 1 0 0 0
2 0 0 0 0 0
3 0 1 0 0 0
4 0 1 0 0 0

Age_map_55-60 Age_map_60+ Age_map_Null
0 0 0 0
1 0 0 0
2 0 0 0
3 0 0 0
4 0 0 0

[5 rows x 169 columns]
Out[267]:
  SibSp Parch Fare Pclass Sex_female Sex_male Cabin_A11 Cabin_A18 Cabin_A21 Cabin_A29 ... Age_map_20-25 Age_map_25-30 Age_map_30-35 Age_map_35-40 Age_map_40-45 Age_map_45-50 Age_map_50-55 Age_map_55-60 Age_map_60+ Age_map_Null
0 0 0 7.8292 3 0 1 0 0 0 0 ... 0 0 1 0 0 0 0 0 0 0
1 1 0 7.0000 3 1 0 0 0 0 0 ... 0 0 0 0 0 1 0 0 0 0
2 0 0 9.6875 2 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
3 0 0 8.6625 3 0 1 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 0
4 1 1 12.2875 3 1 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 0

5 rows × 98 columns

In [ ]:
#缺失年龄填补
In [9]:
M=np.random.randn(2,3)      #i代表性别 j代表仓等级
for i in range(2):
for j in range(3):
M[i][j]=df[(df['Pclass']==j+1) & (df['Gender']==i)]['Age'].median() #取中位数
M
Out[9]:
array([[ 35. ,  28. ,  21.5],
[ 40. , 30. , 25. ]])
In [10]:
df['AgeFill']=df['Age']
print(len(df[df['Age'].isnull()][['Gender','Pclass','Age','AgeFill']]))
print(df[df['Age'].isnull()][['Gender','Pclass','Age','AgeFill']].head())
for i in range(2):
for j in range(3):
df.loc[df[(df['Pclass']==j+1) & (df['Gender']==i) & (df['Age'].isnull())].index,['AgeFill']]=M[i][j] #.index取索引值
df[df['Age'].isnull()][['Gender','Pclass','Age','AgeFill']].head()
177
Gender Pclass Age AgeFill
5 1 3 NaN NaN
17 1 2 NaN NaN
19 0 3 NaN NaN
26 1 3 NaN NaN
28 0 3 NaN NaN
Out[10]:
  Gender Pclass Age AgeFill
5 1 3 NaN 25.0
17 1 2 NaN 30.0
19 0 3 NaN 21.5
26 1 3 NaN 25.0
28 0 3 NaN 21.5
In [11]:
#--------------特征工程
df['familysize']=df['SibSp']+df['Parch']
df['Pclass*AgeFill']=df['Pclass']*df['AgeFill']
In [12]:
for i in df.dtypes:
print (i)
int64
int64
int64
object
object
float64
int64
int64
object
float64
object
object
int64
float64
int64
float64
In [13]:
#df=df.drop(['Pclass*Age'],axis=1) 丢弃一列
print(df.dtypes)
df.dtypes[df.dtypes.map(lambda x: x=='object')]
PassengerId         int64
Survived int64
Pclass int64
Name object
Sex object
Age float64
SibSp int64
Parch int64
Ticket object
Fare float64
Cabin object
Embarked object
Gender int64
AgeFill float64
familysize int64
Pclass*AgeFill float64
dtype: object
Out[13]:
Name        object
Sex object
Ticket object
Cabin object
Embarked object
dtype: object
In [14]:
df=df.drop(['Name','Sex','Age','Ticket','Cabin','Embarked'],axis=1)
df.head()
Out[14]:
  PassengerId Survived Pclass SibSp Parch Fare Gender AgeFill familysize Pclass*AgeFill
0 1 0 3 1 0 7.2500 1 22.0 1 66.0
1 2 1 1 1 0 71.2833 0 38.0 1 38.0
2 3 1 3 0 0 7.9250 0 26.0 0 78.0
3 4 1 1 1 0 53.1000 0 35.0 1 35.0
4 5 0 3 0 0 8.0500 1 35.0 0 105.0
In [15]:
from sklearn.cross_validation import train_test_split
from sklearn.linear_model.logistic import LogisticRegression
from sklearn import preprocessing
import seaborn as sns
feature_names=['Pclass','SibSp','Parch','Fare','Gender','AgeFill','familysize','Pclass*AgeFill']
X=df[feature_names]
Y=df['Survived']
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.3,random_state=0) #7:3拆分  
lr_model=LogisticRegression()
lr_model.fit(X_train,y_train)
y_pred_score=lr_model.predict_proba(X_test)
y_pred_score[:10]
D:\Anaconda3\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
"This module will be removed in 0.20.", DeprecationWarning)
Out[15]:
array([[ 0.857523  ,  0.142477  ],
[ 0.86057444, 0.13942556],
[ 0.9224654 , 0.0775346 ],
[ 0.10549417, 0.89450583],
[ 0.41466724, 0.58533276],
[ 0.59148554, 0.40851446],
[ 0.0714005 , 0.9285995 ],
[ 0.06661603, 0.93338397],
[ 0.6030931 , 0.3969069 ],
[ 0.28987151, 0.71012849]])
In [16]:
#from sklearn.metrics import roc_curve
import sklearn
import matplotlib.pyplot as plt
fpr,tpr,thresholds=sklearn.metrics.roc_curve(y_test,y_pred_score[:,1])#注意阈值  
roc_auc=sklearn.metrics.auc(fpr,tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr,tpr,'b',label='AUC = %0.2f'%roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.0])
plt.ylim([-0.1,1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
D:\Anaconda3\lib\site-packages\matplotlib\legend.py:326: UserWarning: Unrecognized location "lower right". Falling back on "best"; valid locations are
best
upper right
upper left
lower left
lower right
right
center left
center right
lower center
upper center
center

six.iterkeys(self.codes))))
  Kaggle实战——泰坦尼克生存预测大赛
In [17]:
df=df.dropna()    #有空值存在的所有行
train_data=df.values
train_data
Out[17]:
array([[   1. ,    0. ,    3. , ...,   22. ,    1. ,   66. ],
[ 2. , 1. , 1. , ..., 38. , 1. , 38. ],
[ 3. , 1. , 3. , ..., 26. , 0. , 78. ],
...,
[ 889. , 0. , 3. , ..., 21.5, 3. , 64.5],
[ 890. , 1. , 1. , ..., 26. , 0. , 26. ],
[ 891. , 0. , 3. , ..., 32. , 0. , 96. ]])
In [18]:
#使用网格搜索最佳模型参数!!!!!! X_train,X_test,y_train,y_test
from sklearn.model_selection import GridSearchCV
base_line_model = LogisticRegression()
param = {'penalty':['l1','l2'],
'C':[0.1, 0.5, 1.0,5.0]}
grd = GridSearchCV(estimator=base_line_model, param_grid=param, cv=5, n_jobs=3)
grd.fit(X_train,y_train)
grd.best_estimator_
Out[18]:
LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
verbose=0, warm_start=False)
In [27]:
from sklearn.model_selection import learning_curve
from sklearn.utils import shuffle
X_train, y_train = shuffle(X_train, y_train)
def plot_learning_curve(clf, title, X, y, ylim=None, cv=None, n_jobs=3, train_sizes=np.linspace(.05, 1., 5)):
train_sizes, train_scores, test_scores = learning_curve(
clf, X, y, train_sizes=train_sizes)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

ax = plt.figure().add_subplot(111)
ax.set_title(title)
if ylim is not None:
ax.ylim(*ylim)
ax.set_xlabel(u"train_num_of_samples")
ax.set_ylabel(u"score")

ax.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
alpha=0.1, color="b")
ax.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,
alpha=0.1, color="r")
ax.plot(train_sizes, train_scores_mean, 'o-', color="b", label=u"train score")
ax.plot(train_sizes, test_scores_mean, 'o-', color="r", label=u"testCV score")

ax.legend(loc="best")

midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) + (test_scores_mean[-1] - test_scores_std[-1])) / 2
diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1])
return midpoint, diff

plot_learning_curve(grd, u"learning_rate", X_train, y_train)
D:\Anaconda3\lib\site-packages\sklearn\model_selection\_split.py:597: Warning: The least populated class in y has only 2 members, which is too few. The minimum number of members in any class cannot be less than n_splits=5.
% (min_groups, self.n_splits)), Warning)
Out[27]:
(0.79962671354141879, 0.069011468383558872)
In [28]:
plt.show()
  Kaggle实战——泰坦尼克生存预测大赛
In [44]:
#from sklearn.metrics import roc_curve
import sklearn
import matplotlib.pyplot as plt
fpr,tpr,thresholds=sklearn.metrics.roc_curve(y_test,grd.predict_proba(X_test)[:,1],pos_label=1)#grd可以预测
roc_auc=sklearn.metrics.auc(fpr,tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr,tpr,'b',label='AUC = %0.2f'%roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.0])
plt.ylim([-0.1,1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
D:\Anaconda3\lib\site-packages\matplotlib\legend.py:326: UserWarning: Unrecognized location "lower right". Falling back on "best"; valid locations are
best
upper right
upper left
lower left
lower right
right
center left
center right
lower center
upper center
center

six.iterkeys(self.codes))))
  Kaggle实战——泰坦尼克生存预测大赛
In [45]:
#使用grd模型生成预测结果 并存入CSV
#df_test=pd.read_csv('D:/In/kaggle/Titanic/test.csv')
gender_submission = pd.DataFrame({'PassengerId':X_test.index,'Survived':grd.predict(X_test)})
gender_submission.to_csv('C://Users//zhangshuai_lc//submission_first.csv', index=None)