一、IV计算代码
-
def _binned_iv(labels, n_bins, eps):
    """Return the summed IV of equal-size bins over labels sorted by feature.

    ``labels`` is a list of 0/1 labels already ordered by ascending feature
    value. Returns ``None`` when either class is absent, because the
    good/bad distributions (and therefore IV) are undefined in that case.
    """
    import math

    total_bad = labels.count(1)
    total_good = labels.count(0)
    if total_bad == 0 or total_good == 0:
        return None

    n_rows = len(labels)
    # At least one row per bin, so fewer rows than bins cannot divide by zero.
    bin_size = max(n_rows // n_bins, 1)

    # Each entry: [group index, bad count, good count, first row, end row].
    bins = []
    good = 0
    bin_start = 0
    for group, start in enumerate(range(0, n_rows, bin_size)):
        stop = min(start + bin_size, n_rows)
        chunk = labels[start:stop]
        bad = chunk.count(1)
        good += chunk.count(0)
        if bad == 0:
            # A bin without positives is carried into the next bin that has
            # some, so the log term never sees a zero bad rate.
            continue
        bins.append([group, bad, good, bin_start, stop])
        good = 0
        bin_start = stop

    if bin_start < n_rows:
        # Trailing rows without positives are merged into the last bin
        # instead of being dropped from the good distribution.
        bins[-1][2] += good
        bins[-1][4] = n_rows

    iv_total = 0
    for group, bad, good, first, end in bins:
        bad_rate = (bad + eps) / total_bad
        good_rate = (good + eps) / total_good
        iv = (bad_rate - good_rate) * math.log(bad_rate / good_rate)
        if iv > 1:
            # Diagnostic: a single bin contributing IV > 1 is usually a bin
            # with (almost) no negatives.
            rows = end - first
            print('group', group, rows, sum(labels[first:end]) / rows)
        iv_total += iv
    return iv_total


def cal_iv(df, label_col, feat_cols, bin=10):
    """Compute the Information Value (IV) of each feature against a 0/1 label.

    For every feature the rows with a missing feature or label value are
    dropped, the remaining rows are sorted by the feature and cut into
    consecutive bins of ``len(rows) // bin`` rows (a final shorter bin holds
    any remainder). Bins without positives are merged with the following
    bin. The per-bin contributions
    ``(bad_rate - good_rate) * ln(bad_rate / good_rate)`` are summed and
    rounded to 3 decimals.

    Args:
        df: pandas DataFrame holding the features and the label.
        label_col: name of the label column; values 1 and 0 are counted as
            bad and good respectively.
        feat_cols: iterable of feature column names.
        bin: target number of equal-frequency bins (the name shadows the
            builtin but is kept for backward compatibility).

    Returns:
        DataFrame with columns ``feature`` and ``iv`` sorted by ``iv`` in
        descending order. Features with fewer than 10 distinct values, or
        whose rows contain only one label class, are skipped; the frame is
        empty (but still has both columns) when every feature is skipped.
    """
    import pandas as pd

    eps = 0.0000000000001  # keeps rates strictly positive inside the log
    min_distinct = 10      # features with fewer distinct values are skipped

    results = []
    for col in feat_cols:
        data = df[[col, label_col]].dropna()
        if data[col].nunique() < min_distinct:
            continue
        ordered = data.sort_values(by=col, ascending=True)
        iv = _binned_iv(ordered[label_col].tolist(), bin, eps)
        if iv is None:
            continue
        results.append({'feature': col, 'iv': round(iv, 3)})

    # Passing ``columns`` keeps both columns present when no feature qualifies.
    df_re = pd.DataFrame(results, columns=['feature', 'iv'])
    return df_re.sort_values(by='iv', ascending=False)
方法调用:
cal_iv(df_temp, Y, feas_list, bin=10)
df_temp:数据集,Y:是否逾期标签,feas_list:需要计算 IV 的变量
2、按月和不同 Y 下的 IV 计算
-
def iv_distr_v2(df, flag, feas=feas_list):
    """Compute feature IVs per label column and per value of ``flag``.

    For each of the label columns ``mob1_15``, ``mob1_30``, ``mob2_30`` and
    ``mob3_30`` the rows where that label is null are removed, the rest is
    split by the distinct values of the ``flag`` column, and ``cal_iv`` is
    run on every split.

    Args:
        df: pandas DataFrame containing the label columns, the ``flag``
            column and the feature columns.
        flag: name of the column used to split the data (e.g. a month).
        feas: feature column names passed on to ``cal_iv``.

    Returns:
        The concatenated ``cal_iv`` results with two extra columns:
        ``month`` (the ``flag`` value) and ``Y`` (the label column name).
        An empty DataFrame when no split produced a result.
    """
    import pandas as pd

    frames = []
    # Loop over the different Y (label) definitions.
    for label in ['mob1_15', 'mob1_30', 'mob2_30', 'mob3_30']:
        print(label)
        labelled = df[df[label].notnull()]
        for period in labelled[flag].unique():
            subset = labelled.loc[labelled[flag] == period]
            print(labelled.shape, subset.shape)
            # Use the caller-supplied feature list, not the module global.
            iv_table = cal_iv(subset, label, feas, bin=10)
            iv_table['month'] = period
            iv_table['Y'] = label
            frames.append(iv_table)

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames, axis=0)
调用代码:
iv_all = iv_distr_v2(df, flag= 'month',feas=feas_list, )
二、KS、AUC 调用代码,只对值越大逾期越高的分数有效
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, roc_curve
## Compute AUC
def cal_auc(df, y_true, y_prob):
    """Return the ROC AUC of score column ``y_prob`` against ``y_true``.

    Args:
        df: table indexable by column name (e.g. a pandas DataFrame).
        y_true: name of the ground-truth label column.
        y_prob: name of the score column.

    Returns:
        The value returned by ``roc_auc_score``, or ``None`` when it cannot
        be computed (for example a missing column).
    """
    try:
        return roc_auc_score(df[y_true], df[y_prob])
    except Exception:
        # Deliberate best-effort: callers get None instead of a crash, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return None
## Compute KS
def cal_ks(df, y_true, y_prob):
    """Return the KS statistic of score column ``y_prob`` against ``y_true``.

    KS is taken as the maximum of ``tpr - fpr`` over the thresholds returned
    by ``roc_curve``.

    Args:
        df: table indexable by column name (e.g. a pandas DataFrame).
        y_true: name of the ground-truth label column.
        y_prob: name of the score column.

    Returns:
        The maximum ``tpr - fpr`` gap, or ``None`` when it cannot be
        computed (for example a missing column).
    """
    try:
        fpr, tpr, _ = roc_curve(df[y_true], df[y_prob])
        return max(tpr - fpr)
    except Exception:
        # Deliberate best-effort: callers get None instead of a crash, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return None