The data comes from the Kaggle competition Give Me Some Credit. The goal is to predict the probability that a borrower will experience financial distress within the next two years, so that lenders can make better decisions. Following the principles of credit scorecard construction, we use Python for data cleaning and preprocessing, and then build a baseline scorecard model with logistic regression.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
plt.style.use("fivethirtyeight")
%matplotlib inline
input1='../data/cs-test.csv'
input2='../data/cs-training.csv'
test=pd.read_csv(input1)
train=pd.read_csv(input2)
train.sample(10)  # randomly sample 10 records for a quick look
test.sample(10)
train.info()  # basic information about the data
test.info()
train.describe()
test.describe()
plt.figure(figsize=(10,8))  # visualize the label distribution
sns.countplot(x="SeriousDlqin2yrs", data=train)
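The plot shows a heavy class imbalance. Checking the exact proportions (a small addition, not in the original notebook) makes this concrete and explains the class_weight="balanced" setting used in the model later:
train["SeriousDlqin2yrs"].value_counts(normalize=True)  # fraction of non-defaulters vs. defaulters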
train[train['age']<18]  # inspect records with age under 18
train.loc[train['age']==0, 'age'] = train['age'].median()  # outlier handling: replace age 0 with the median age
age_working = train.loc[(train['age']>=18) & (train['age']<60)]  # split the data by age into working-age and senior groups
age_senior=train.loc[train['age']>=60]
# fill missing MonthlyIncome with each group's mean
age_working_income=age_working['MonthlyIncome'].mean()
age_senior_income=age_senior['MonthlyIncome'].mean()
train["MonthlyIncome"] = train["MonthlyIncome"].fillna(99999)
train.loc[((train['age']>=18) & (train['age']<60))&(train["MonthlyIncome"]==99999),"MonthlyIncome"]=age_working_income#用对应平均数填充
train.loc[(train['age']>=60)&(train["MonthlyIncome"]==99999),"MonthlyIncome"]=age_senior_income
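The sentinel-based fill above works, but it would also overwrite any genuine income of exactly 99999. An equivalent, sentinel-free alternative using groupby/transform, shown here only as a sketch for comparison and not part of the original pipeline:
# group-wise mean imputation without a sentinel value
age_group = np.where(train['age'] >= 60, 'senior', 'working')
train['MonthlyIncome'] = train['MonthlyIncome'].fillna(
    train.groupby(age_group)['MonthlyIncome'].transform('mean'))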
train["MonthlyIncome"] = train["MonthlyIncome"].astype('int64')
train["NumberOfDependents"] = train["NumberOfDependents"].fillna(0)#用0填充
train["NumberOfDependents"] = train["NumberOfDependents"].astype('int64')
train["NumberOfDependents"].value_counts()#分类求和
corr = train.corr()  # correlation analysis
plt.figure(figsize=(14,12))
sns.heatmap(corr, annot=True, fmt=".2g")
The heatmap shows that pairwise correlations among the raw attributes are low, so we combine and reconstruct attributes.
# combined delinquency indicator
train["CombinedDefaulted"]=train['NumberOfTime30-59DaysPastDueNotWorse']+train['NumberOfTime60-89DaysPastDueNotWorse']+train['NumberOfTimes90DaysLate']
train.loc[train["CombinedDefaulted"]>=1,"CombinedDefaulted"]=1
# combined credit and loan accounts
train["CombinedCreditLoans"] = train["NumberOfOpenCreditLinesAndLoans"] +train["NumberRealEstateLoansOrLines"]
train.loc[(train["CombinedCreditLoans"] <= 5), "CombinedCreditLoans"] = 0
train.loc[(train["CombinedCreditLoans"] > 5), "CombinedCreditLoans"] = 1
train["CombinedCreditLoans"].value_counts()
train["WithDependents"] = train["NumberOfDependents"]
train.loc[(train["WithDependents"] >= 1), "WithDependents"] = 1
train["WithDependents"].value_counts()
# monthly debt payments
train["MonthlyDebtPayments"] = train["DebtRatio"] * train["MonthlyIncome"]
train["MonthlyDebtPayments"] = np.absolute(train["MonthlyDebtPayments"])
train["MonthlyDebtPayments"] = train["MonthlyDebtPayments"].astype('int64')
train['age'] = train['age'].astype('int64')
train["MonthlyDebtPayments"] = train["MonthlyDebtPayments"].astype('int64')
train["age_map"] = train["age"]
train.loc[(train["age"] >= 18) & (train["age"] < 60), "age_map"] = 1
train.loc[(train["age"] >= 60), "age_map"] = 0
# replace the numeric codes with categorical labels, then create dummy variables
train["age_map"] = train["age_map"].replace(1, "working")
train["age_map"] = train["age_map"].replace(0, "senior")
train= pd.concat([train, pd.get_dummies(train.age_map,prefix='is')], axis=1)
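A quick look at the new columns (a small check added here, not in the original) shows how age_map was expanded into the is_working and is_senior dummy columns:
train[['age', 'age_map', 'is_working', 'is_senior']].sample(5)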
# use the correlation matrix to decide which attributes to keep
corr = train.corr()
plt.figure(figsize=(14,12))
sns.heatmap(corr, annot=True, fmt=".2g")
Based on this, several attributes can be dropped:
train.drop(["Unnamed: 0", "NumberOfOpenCreditLinesAndLoans", "NumberOfTimes90DaysLate",
            "NumberRealEstateLoansOrLines", "NumberOfTime60-89DaysPastDueNotWorse",
            "WithDependents", "age_map", "is_senior", "is_working",
            "MonthlyDebtPayments"], axis=1, inplace=True)
Next, the test data is processed in the same way. The steps are identical to those used for train, so they are wrapped in a single function.
# process the test data; the steps mirror the training data
def cleaned_dataset(dataset):
    dataset.loc[dataset["age"] <= 18, "age"] = dataset.age.median()
    age_working = dataset.loc[(dataset["age"] >= 18) & (dataset["age"] < 60)]
    age_senior = dataset.loc[dataset["age"] >= 60]
    age_working_impute = age_working.MonthlyIncome.mean()
    age_senior_impute = age_senior.MonthlyIncome.mean()
    dataset["MonthlyIncome"] = np.absolute(dataset["MonthlyIncome"])
    dataset["MonthlyIncome"] = dataset["MonthlyIncome"].fillna(99999)
    dataset["MonthlyIncome"] = dataset["MonthlyIncome"].astype('int64')
    dataset.loc[(dataset["age"] >= 18) & (dataset["age"] < 60) &
                (dataset["MonthlyIncome"] == 99999), "MonthlyIncome"] = age_working_impute
    dataset.loc[(dataset["age"] >= 60) &
                (dataset["MonthlyIncome"] == 99999), "MonthlyIncome"] = age_senior_impute
    dataset["NumberOfDependents"] = np.absolute(dataset["NumberOfDependents"])
    dataset["NumberOfDependents"] = dataset["NumberOfDependents"].fillna(0)
    dataset["NumberOfDependents"] = dataset["NumberOfDependents"].astype('int64')
    dataset["CombinedDefaulted"] = (dataset["NumberOfTimes90DaysLate"]
                                    + dataset["NumberOfTime60-89DaysPastDueNotWorse"]
                                    + dataset["NumberOfTime30-59DaysPastDueNotWorse"])
    dataset.loc[dataset["CombinedDefaulted"] >= 1, "CombinedDefaulted"] = 1
    dataset["CombinedCreditLoans"] = (dataset["NumberOfOpenCreditLinesAndLoans"]
                                      + dataset["NumberRealEstateLoansOrLines"])
    dataset.loc[dataset["CombinedCreditLoans"] <= 5, "CombinedCreditLoans"] = 0
    dataset.loc[dataset["CombinedCreditLoans"] > 5, "CombinedCreditLoans"] = 1
    dataset.drop(["Unnamed: 0", "NumberOfOpenCreditLinesAndLoans", "NumberOfTimes90DaysLate",
                  "NumberRealEstateLoansOrLines", "NumberOfTime60-89DaysPastDueNotWorse"],
                 axis=1, inplace=True)
cleaned_dataset(test)
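A quick check of how many missing values remain after cleaning (a small addition, not in the original):
train.isnull().sum()
test.isnull().sum()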
# separate the label column from the feature columns
X = train.drop("SeriousDlqin2yrs", axis=1).copy()  # feature columns
y = train.SeriousDlqin2yrs  # label column
X_test = test.drop("SeriousDlqin2yrs", axis=1).copy()
y_test = test.SeriousDlqin2yrs
# build the LogisticRegression model
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
X_train, X_val, y_train, y_val = train_test_split(X,y,random_state=42)
logit = LogisticRegression(random_state=42, solver="saga", penalty="l1", class_weight="balanced", C=1.0, max_iter=500)  # class_weight="balanced" compensates for the label imbalance
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
logit.fit(X_train_scaled, y_train)
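Because the model uses an L1 penalty, some coefficients can be shrunk to exactly zero. Inspecting them (a small addition, not in the original) shows which features the model actually relies on:
pd.Series(logit.coef_[0], index=X_train.columns).sort_values()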
logit_scores_proba = logit.predict_proba(X_val_scaled)  # predicted probabilities on the validation set
logit_scores = logit_scores_proba[:,1]  # probability of the positive class (default)
def plot_roc_curve(fpr, tpr, label=None):
    plt.figure(figsize=(12,10))
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0,1],[0,1], "k--")
    plt.axis([0,1,0,1])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
fpr_logit, tpr_logit, thresh_logit = roc_curve(y_val, logit_scores)  # compute the ROC curve on the validation set
plot_roc_curve(fpr_logit,tpr_logit)
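roc_auc_score is imported above but never used; a one-line summary of model quality on the validation split (a small addition, not in the original):
roc_auc_score(y_val, logit_scores)  # area under the ROC curve

To go from this baseline model to an actual scorecard, the predicted probabilities are usually mapped to points with the standard odds/PDO scaling. The sketch below assumes a score of 600 at good:bad odds of 19:1 and 20 points to double the odds; these numbers are illustrative choices, not values from the original:
# sketch: convert predicted default probability into scorecard points
def prob_to_score(p, base_score=600, base_odds=19, pdo=20):
    # base_score corresponds to good:bad odds of base_odds; each doubling of the odds adds pdo points
    p = np.clip(p, 1e-6, 1 - 1e-6)       # guard against log(0)
    factor = pdo / np.log(2)
    offset = base_score - factor * np.log(base_odds)
    odds_good = (1 - p) / p              # odds of not defaulting
    return offset + factor * np.log(odds_good)

scores = prob_to_score(logit_scores)     # scorecard points for the validation samples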