O2O淘宝优惠券代码总结

时间:2024-03-10 14:57:43

一、数据集预处理

  1、数据读入

import pandas as pd
import numpy as np
import datetime as date
import datetime as dt

#先导入数据

off_train = pd.read_csv("data/ccf_offline_stage1_train.csv",header = 0)
off_train.columns = [\'user_id\',\'merchant_id\',\'coupon_id\',\'discount_rate\',\'distance\',\'date_received\',\'date\']
# read_csv 读入数据(header = 0)不读入表头,第二句设置表头
off_test = pd.read_csv("data/ccf_offline_stage1_test_revised.csv",header = 0)
off_test.columns  = [\'user_id\',\'merchant_id\',\'coupon_id\',\'discount_rate\',\'distance\',\'date_received\']

on_train = pd.read_csv("data/ccf_online_stage1_train.csv",header=0)
on_train.columns = [\'user_id\',\'merchant_id\',\'action\',\'coupon_id\',\'discount_rate\',\'date_received\',\'date\']

  

  2、数据划分

# 按照时间划分训练集和测试集
# 滑窗法划分
"""
将2016年1月1日到4月13日的数据提取特征,利用4月14日的到5月14日的作为测试集
将2月1日到5月14日的作为数据集提取特征,利用5月15日6月15日的作为测试集
将3月15日到6月30日作为数据集提取特征,再测试7月1日到7月31日的数据
dataset用来做测试集,feature用来做训练集
"""
#数据集3的特征为 取 线上数据中领券和用券日期大于3月15日和小于6月30日的
#将3月15日到6月30日作为数据集提取特征,再测试7月1日到7月31日的数据
#使数据集3等于test集
""""
dataset里面只有接收优惠券的记录的,无消费记录,可用于预测
feature里面存的是优惠券使用日期或接收优惠券的时间介于3月15日到6月30之间的记录
"""
dataset3 = off_test
feature3 = off_train[((off_train.date>=\'20160315\')&(off_train.date<=\'20160630\')|((off_train.date==\'null\')&(off_train.date_received>=\'20160315\')&(off_train.date_received<=\'20160630\')))]

#提取数据集2的测试集
#将2月1日到5月14日的作为数据集提取特征,利用5月15日6月15日的作为测试集
""""
dataset里面只存放优惠券接收日期介于5月15日到6月15之间的记录
feature里面存的是优惠券使用日期或接收优惠券的时间介于2月1日到5月14之间的记录
"""
dataset2 = off_train[((off_train.date_received>=\'20160515\')&(off_train.date_received<=\'20160615\'))]

feature2 = off_train[(off_train.date>=\'20160201\')&(off_train.date<=\'20160514\')|((off_train.date==\'null\')&(off_train.date_received>=\'20160201\')&(off_train.date_received<=\'20160514\'))]
""""
dataset里面只存放优惠券接收日期介于4月14日到5月14之间的记录
feature里面存的是优惠券使用日期或接收优惠券的时间介于1月1日到4月13之间的记录
"""
dataset1 = off_train[(off_train.date_received>=\'201604014\')&(off_train.date_received<=\'20160514\')]
feature1 = off_train[(off_train.date>=\'20160101\')&(off_train.date<=\'20160413\')|((off_train.date==\'null\')&(off_train.date_received>=\'20160101\')&(off_train.date_received<=\'20160413\'))]

二、特征工程

  1、提取其他特征

"""
# 提取特征:
    用户领取的所有优惠券数目
    ◦用户领取的特定优惠券数目
    ◦用户此次之后/前领取的所有优惠券数目
    ◦用户此次之后/前领取的特定优惠券数目
    ◦用户上/下一次领取的时间间隔
    ◦用户领取特定商家的优惠券数目
    ◦用户领取的不同商家数目
    ◦用户当天领取的优惠券数目
    ◦用户当天领取的特定优惠券数目
    ◦用户领取的所有优惠券种类数目
    ◦商家被领取的优惠券数目
    ◦商家被领取的特定优惠券数目
    ◦商家被多少不同用户领取的数目
    ◦商家发行的所有优惠券种类数目
"""
# 对dataset3进行操作
# 用户收到的优惠券总和
t = dataset3[[\'user_id\']]
t[\'this_month_user_received_all_coupon_count\'] = 1
 #将t按照用户id进行分组,然后统计所有用户收取的优惠券数目,并初始化一个索引值
t = t.groupby(\'user_id\').agg(\'sum\').reset_index()
# 用户收到特定优惠券的总和
t1 = dataset3[[\'user_id\',\'coupon_id\']]
t1[\'this_month_user_receive_same_coupon_count\'] = 1
t1 = t1.groupby([\'user_id\',\'coupon_id\']).agg(\'sum\').reset_index()

# 用户此次之前或之后领使用优惠券的时间
# lambda x:\':\'.join(x)  是添加冒号并在后面去加字符
# 将接收时间的一组按着\':\'分开,这样就可以计算接受了优惠券的数量,apply是合并
# 最大接受的日期max_date_received/min_date_received
t2 = dataset3[[\'user_id\',\'coupon_id\',\'date_received\']]
t2.date_received = t2.date_received.astype(\'str\')
t2 = t2.groupby([\'user_id\',\'coupon_id\'])[\'date_received\'].agg(lambda x:\':\'.join(x)).reset_index()
t2[\'receive_number\'] = t2.date_received.apply(lambda s:len(s.split(\':\')))
t2 = t2[t2.receive_number>1]
t2[\'max_date_received\'] = t2.date_received.apply(lambda s:max([int (d) for d in s.split(\':\')]))
t2[\'min_date_received\'] = t2.date_received.apply(lambda s:min([int (d) for d in s.split(\':\')]))
t2 = t2[[\'user_id\',\'coupon_id\',\'max_date_received\',\'min_date_received\']]

# 将表格中接收优惠券日期中为最近和最远的日期时置为1其余为0,若只接受了一次优惠券为-1
# 将两表融合只保留左表数据,这样得到的表,相当于保留了最近接收时间和最远接受时间
t3 = dataset3[[\'user_id\',\'coupon_id\',\'date_received\']]
t3 = pd.merge(t3,t2,on=[\'user_id\',\'coupon_id\'],how=\'left\')
t3[\'this_month_user_receive_same_coupon_lastone\'] = t3.max_date_received - t3.date_received.astype(int)

t3[\'this_month_user_receive_same_coupon_firstone\'] = t3.date_received.astype(int) - t3.min_date_received

def isfirstlastone(x):
    
    if x == 0:
        return 1
    elif x > 0:
        return 0
    else:
        return -1
# 只接受过一次优惠券为者为 -1


t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(isfirstlastone)
t3.this_month_user_receive_same_coupon_firstone = t3.this_month_user_receive_same_coupon_firstone.apply(isfirstlastone)

# 第四个特征,一个用户所接收到的所有优惠券的数量
t4 = dataset3[[\'user_id\',\'date_received\']]
t4[\'this_day_user_receive_all_coupon_count\'] = 1
t4 = t4.groupby([\'user_id\',\'date_received\']).agg(\'sum\').reset_index()

# 提取第五个特征,一个用户不同时间所接收到不同优惠券的数量
t5 = dataset3[[\'user_id\',\'coupon_id\',\'date_received\']]
t5[\'this_day_user_receive_same_coupon_count\'] = 1
t5 = t5.groupby([\'user_id\',\'coupon_id\',\'date_received\']).agg(\'sum\').reset_index()

# 一个用户不同优惠券 的接受时间
t6 = dataset3[[\'user_id\',\'coupon_id\',\'date_received\']]
t6.date_received = t6.date_received.astype(\'str\')
t6 = t6.groupby([\'user_id\',\'coupon_id\'])[\'date_received\'].agg(lambda x:\':\'.join(x)).reset_index()
t6.rename(columns ={\'date_received\':\'dates\'},inplace = True)

# 接收优惠券最近的日子天数
def get_day_gap_before(s):
    date_received,dates = s.split(\'-\')
    dates = dates.split(\':\')
    gaps = []
    for d in dates:
#         print(date_received.type())
        this_gap = (dt.date(int(date_received[1:4]),(int(date_received[4:6])),(int(date_received[6:8]))) - dt.date((int(d[1:4])),(int(d[4:6])),(int(d[6:8])))).days
        if this_gap>0:
            gaps.append(this_gap)
    if len(gaps) == 0:
        return -1
    else:
        return min(gaps)
# 接收优惠券最远的日子天数
def get_day_gap_after(s):
    date_received,dates = s.split(\'-\')
    dates = dates.split(\':\')
    gaps = []
    for d in dates:
        this_gap = (dt.datetime(int(d[0:4]),int(d[4:6]),int(d[6:8])) - dt.datetime(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))).days
        if this_gap>0:
            gaps.append(this_gap)
    if len(gaps) == 0:
        return -1
    else:
        return min(gaps)

t7 = dataset3[[\'user_id\',\'coupon_id\',\'date_received\']]
t7 = pd.merge(t7,t6,on=[\'user_id\',\'coupon_id\'],how=\'left\')
t7[\'date_received_date\'] = t7.date_received.astype(\'str\') + \'-\' + t7.dates.astype(\'str\')

t7[\'day_gap_before\'] = t7.date_received_date.apply(get_day_gap_before)
t7[\'day_gap_after\']  = t7.date_received_date.apply(get_day_gap_after)
t7 = t7[[\'user_id\',\'coupon_id\',\'date_received\',\'day_gap_before\',\'day_gap_after\']]
# feature3 提取的特征存入CSV中
other_feature3 = pd.merge(t1,t,on=\'user_id\')
other_feature3 = pd.merge(other_feature3,t3,on=[\'user_id\',\'coupon_id\'])
other_feature3 = pd.merge(other_feature3,t4,on=[\'user_id\',\'date_received\'])
other_feature3 = pd.merge(other_feature3,t5,on=[\'user_id\',\'coupon_id\',\'date_received\'])
other_feature3 = pd.merge(other_feature3,t7,on=[\'user_id\',\'coupon_id\',\'date_received\'])
other_feature3.to_csv(\'feature/other_feature3.csv\',index=None)



# 处理dataset2
t = dataset2[[\'user_id\']]
t[\'this_month_user_received_all_coupon_count\'] = 1
t = t.groupby(\'user_id\').agg(\'sum\').reset_index()
t1 = dataset2[[\'user_id\',\'coupon_id\']]
t1[\'this_month_user_receive_same_coupon_count\'] = 1
t1 = t1.groupby([\'user_id\',\'coupon_id\']).agg(\'sum\').reset_index()
t2 = dataset3[[\'user_id\',\'coupon_id\',\'date_received\']]
t2.date_received = t2.date_received.astype(\'str\')
t2 = t2.groupby([\'user_id\',\'coupon_id\'])[\'date_received\'].agg(lambda x:\':\'.join(x)).reset_index()
t2[\'receive_number\'] = t2.date_received.apply(lambda s:len(s.split(\':\')))
t2 = t2[t2.receive_number>1]
t2[\'max_date_received\'] = t2.date_received.apply(lambda s:max([int(d) for d in s.split(\':\')]))
t2[\'min_date_received\'] = t2.date_received.apply(lambda s:max([int(d) for d in s.split(\':\')]))
t2 = t2[[\'user_id\',\'coupon_id\',\'max_date_received\',\'min_date_received\']]

t3 = dataset2[[\'user_id\',\'coupon_id\',\'date_received\']]
t3 = pd.merge(t3,t2,on=[\'user_id\',\'coupon_id\'],how=\'left\')
t3[\'this_month_user_receive_same_coupon_lastone\'] = t3.max_date_received - t3.date_received.astype(\'int\')
t3[\'this_month_user_receive_same_coupon_firstone\']= t3.date_received.astype(\'int\') - t3.min_date_received
t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(isfirstlastone)
t3.this_month_user_receive_same_coupon_firstone= t3.this_month_user_receive_same_coupon_firstone.apply(isfirstlastone)

t4 = dataset2[[\'user_id\',\'date_received\']]
t4[\'this_day_user_receive_all_coupon_count\'] = 1
t4 = t4.groupby([\'user_id\',\'date_received\']).agg(\'sum\').reset_index()
t5 = dataset2[[\'user_id\',\'coupon_id\',\'date_received\']]
t5[\'this_day_user_receive_same_coupon_count\'] = 1
t5 = t5.groupby([\'user_id\',\'coupon_id\',\'date_received\']).agg(\'sum\').reset_index()

t6 = dataset2[[\'user_id\',\'coupon_id\',\'date_received\']]
t6.date_received = t6.date_received.astype(\'str\')
t6 = t6.groupby([\'user_id\',\'coupon_id\'])[\'date_received\'].agg(lambda x:\':\'.join(x)).reset_index()
t6.rename(columns={\'date_received\':\'dates\'},inplace=True)

t7 = dataset2[[\'user_id\',\'coupon_id\',\'date_received\']]
t7 = pd.merge(t7,t6,on=[\'user_id\',\'coupon_id\'],how=\'left\')
t7[\'date_received_date\'] = t7.date_received.astype(\'str\') + \'-\' + t7.dates
t7[\'day_gap_before\'] = t7.date_received_date.apply(get_day_gap_before)
t7[\'day_gap_after\']  = t7.date_received_date.apply(get_day_gap_before)
t7 = t7[[\'user_id\',\'coupon_id\',\'date_received\',\'day_gap_before\',\'day_gap_after\']]

other_feature2 = pd.merge(t1,t,on=\'user_id\')
other_feature2 = pd.merge(other_feature2,t3,on=[\'user_id\',\'coupon_id\'])
other_feature2 = pd.merge(other_feature2,t4,on=[\'user_id\',\'date_received\'])
other_feature2 = pd.merge(other_feature2,t5,on=[\'user_id\',\'coupon_id\',\'date_received\'])
other_feature2 = pd.merge(other_feature2,t7,on=[\'user_id\',\'coupon_id\',\'date_received\'])
other_feature2.to_csv(\'feature/other_feature2.csv\',index=None)

# 处理dataset1
t = dataset1[[\'user_id\']]
t[\'this_month_user_received_all_coupon_count\'] = 1
t = t.groupby(\'user_id\').agg(\'sum\').reset_index()
t1 = dataset1[[\'user_id\',\'coupon_id\']]
t1[\'this_month_user_receive_same_coupon_count\'] = 1
t1 = t1.groupby([\'user_id\',\'coupon_id\']).agg(\'sum\').reset_index()
t2 = dataset1[[\'user_id\',\'coupon_id\',\'date_received\']]
t2.date_received = t2.date_received.astype(\'str\')
t2 = t2.groupby([\'user_id\',\'coupon_id\'])[\'date_received\'].agg(lambda x:\':\'.join(x)).reset_index()
t2[\'receive_number\'] = t2.date_received.apply(lambda s:len(s.split(\':\')))
t2 = t2[t2.receive_number>1]
t2[\'max_date_received\'] = t2.date_received.apply(lambda s:max([int(d) for d in s.split(\':\')]))
t2[\'min_date_received\'] = t2.date_received.apply(lambda s:max([int(d) for d in s.split(\':\')]))
t2 = t2[[\'user_id\',\'coupon_id\',\'max_date_received\',\'min_date_received\']]

t3 = dataset1[[\'user_id\',\'coupon_id\',\'date_received\']]
t3 = pd.merge(t3,t2,on=[\'user_id\',\'coupon_id\'],how=\'left\')
t3[\'this_month_user_receive_same_coupon_lastone\'] = t3.max_date_received - t3.date_received.astype(\'int\')
t3[\'this_month_user_receive_same_coupon_firstone\']= t3.date_received.astype(\'int\') - t3.min_date_received
t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(isfirstlastone)
t3.this_month_user_receive_same_coupon_firstone= t3.this_month_user_receive_same_coupon_firstone.apply(isfirstlastone)

t4 = dataset1[[\'user_id\',\'date_received\']]
t4[\'this_day_user_receive_all_coupon_count\'] = 1
t4 = t4.groupby([\'user_id\',\'date_received\']).agg(\'sum\').reset_index()
t5 = dataset1[[\'user_id\',\'coupon_id\',\'date_received\']]
t5[\'this_day_user_receive_same_coupon_count\'] = 1
t5 = t5.groupby([\'user_id\',\'coupon_id\',\'date_received\']).agg(\'sum\').reset_index()

t6 = dataset1[[\'user_id\',\'coupon_id\',\'date_received\']]
t6.date_received = t6.date_received.astype(\'str\')
t6 = t6.groupby([\'user_id\',\'coupon_id\'])[\'date_received\'].agg(lambda x:\':\'.join(x)).reset_index()
t6.rename(columns={\'date_received\':\'dates\'},inplace=True)

t7 = dataset1[[\'user_id\',\'coupon_id\',\'date_received\']]
t7 = pd.merge(t7,t6,on=[\'user_id\',\'coupon_id\'],how=\'left\')
t7[\'date_received_date\'] = t7.date_received.astype(\'str\') + \'-\' + t7.dates
t7[\'day_gap_before\'] = t7.date_received_date.apply(get_day_gap_before)
t7[\'day_gap_after\']  = t7.date_received_date.apply(get_day_gap_before)
t7 = t7[[\'user_id\',\'coupon_id\',\'date_received\',\'day_gap_before\',\'day_gap_after\']]

other_feature1 = pd.merge(t1,t,on=\'user_id\')
other_feature1 = pd.merge(other_feature1,t3,on=[\'user_id\',\'coupon_id\'])
other_feature1 = pd.merge(other_feature1,t4,on=[\'user_id\',\'date_received\'])
other_feature1 = pd.merge(other_feature1,t5,on=[\'user_id\',\'coupon_id\',\'date_received\'])
other_feature1 = pd.merge(other_feature1,t7,on=[\'user_id\',\'coupon_id\',\'date_received\'])
other_feature1.to_csv(\'feature/other_feature1.csv\',index=None)

  2、提取优惠券相关特征

# 统一转化为打折卷
def calc_discount_rate(s):
    s = str(s)
    s = s.split(\':\')
    if len(s) == 1:
        return float(s[0])
    else:
        return 1.0-float(s[1])/float(s[0])
def get_discount_man(s):
    s = str(s)
    s = s.split(\':\')
    if len(s) == 1:
        return \'null\'
    else:
        return int(s[0])
def get_discount_jian(s):
    s = str(s)
    s = s.split(\':\')
    if len(s) == 1:
        return \'null\'
    else:
        return int(s[1])
def is_man_jian(s):
    s = str(s)
    s = s.split(\':\')
    if len(s) == 1:
        return 0
    else:
        return 1
# 处理数据集3,处理时间属性,显示时间是第几周
dataset3[\'day_of_week\']  = dataset3.date_received.astype(\'str\').apply(lambda x:(dt.date(int(x[0:4]),int(x[4:6]),int(x[6:8])).weekday()+1))
dataset3[\'day_of_month\'] = dataset3.date_received.astype(\'str\').apply(lambda x:int(x[6:8]))
dataset3[\'days_distance\']= dataset3.date_received.astype(\'str\').apply(lambda x:(dt.date(int(x[0:4]),int(x[4:6]),int(x[6:8]))-dt.date(2016,6,30)).days)
dataset3[\'discount_man\'] = dataset3.discount_rate.apply(get_discount_man)
dataset3[\'discount_jian\']= dataset3.discount_rate.apply(get_discount_jian)
dataset3[\'is_man_jian\']  = dataset3.discount_rate.apply(is_man_jian)
dataset3[\'discount_rate\']= dataset3.discount_rate.apply(calc_discount_rate)
d = dataset3[[\'coupon_id\']]
d[\'coupon_count\'] = 1
d = d.groupby(\'coupon_id\').agg(\'sum\').reset_index()
dataset3 = pd.merge(dataset3,d,on=\'coupon_id\',how=\'left\')
dataset3.to_csv(\'feature/coupon3_feature.csv\',index=None)

# 数据集2
dataset2[\'day_of_week\']  = dataset2.date_received.astype(\'str\').apply(lambda x:dt.date(int(x[0:4]),int(x[4:6]),int(x[6:8])).weekday()+1)
dataset2[\'day_of_month\'] = dataset2.date_received.astype(\'str\').apply(lambda x:int(x[4:6]))
dataset2[\'days_distance\']= dataset2.date_received.astype(\'str\').apply(lambda x:(dt.date(int(x[0:4]),int(x[4:6]),int(x[6:8]))-dt.date(2016,5,14)).days)
dataset2[\'discount_man\'] = dataset2.discount_rate.apply(get_discount_man)
dataset2[\'discount_jian\']= dataset2.discount_rate.apply(get_discount_jian)
dataset2[\'is_man_jian\']  = dataset2.discount_rate.apply(is_man_jian)
dataset2[\'discount_rate\']= dataset2.discount_rate.apply(calc_discount_rate)
d = dataset2[[\'coupon_id\']]
d[\'coupon_count\'] = 1
d = d.groupby(\'coupon_id\').agg(\'sum\').reset_index()
dataset2 = pd.merge(dataset2,d,on=\'coupon_id\',how=\'left\')
dataset2.to_csv(\'feature/coupon2_feature.csv\',index=None)

# 数据集1
dataset1[\'day_of_week\']  = dataset1.date_received.astype(\'str\').apply(lambda x:dt.date(int(x[0:4]),int(x[4:6]),int(x[6:8])).weekday()+1)
dataset1[\'day_of_month\'] = dataset1.date_received.astype(\'str\').apply(lambda x:int(x[4:6]))
dataset1[\'days_distance\']= dataset1.date_received.astype(\'str\').apply(lambda x:(dt.date(int(x[0:4]),int(x[4:6]),int(x[6:8]))-dt.date(2016,4,16)).days)
dataset1[\'discount_man\'] = dataset1.discount_rate.apply(get_discount_man)
dataset1[\'discount_jian\']= dataset1.discount_rate.apply(get_discount_jian)
dataset1[\'is_man_jian\']  = dataset1.discount_rate.apply(is_man_jian)
dataset1[\'discount_rate\']= dataset1.discount_rate.apply(calc_discount_rate)
d = dataset1[[\'coupon_id\']]
d[\'coupon_count\'] = 1
d = d.groupby(\'coupon_id\').agg(\'sum\').reset_index()
dataset1 = pd.merge(dataset1,d,on=\'coupon_id\',how=\'left\')
dataset1.to_csv(\'feature/coupon1_feature.csv\',index=None)

  3、提取商户相关特征

merchant3 = feature3[[\'merchant_id\',\'coupon_id\',\'distance\',\'date_received\',\'date\']]
t = merchant3[[\'merchant_id\']]
# 删除重复的行数据
t.drop_duplicates(inplace=True)
# 显示卖出的商品,以及卖出的数量
# []用来强调条件\或者新建一列并赋值   [[]]用来表示取哪个列来使用
t1 = merchant3[merchant3.date!=\'null\'][[\'merchant_id\']]
t1[\'total_sales\'] = 1
t1 = t1.groupby(\'merchant_id\').agg(\'sum\').reset_index()
# 显示使用了优惠券消费的商品,正样本
t2 = merchant3[(merchant3.date!=\'null\')&(merchant3.coupon_id!=\'null\')][[\'merchant_id\']]
t2[\'sales_use_coupon\'] = 1
t2 = t2.groupby(\'merchant_id\').agg(\'sum\').reset_index()
# 提取商品优惠券的总数量
t3 = merchant3[merchant3.coupon_id!=\'null\'][[\'merchant_id\']]
t3[\'total_coupon\'] = 1
t3 = t3.groupby(\'merchant_id\').agg(\'sum\').reset_index()
# 提取销量与距离的关系
# 把数据中的空值全部替换为 -1
t4 = merchant3[(merchant3.date!=\'null\')&(merchant3.coupon_id!=\'null\')][[\'merchant_id\',\'distance\']]
t4.replace(\'null\',-1,inplace=True)
t4.distance = t4.distance.astype(\'int\')
t4.replace(-1,np.nan,inplace=True)
# 提取用户和商店距离的最小值
t5 = t4.groupby(\'merchant_id\').agg(\'min\').reset_index()
t5.rename(columns={\'distance\':\'merchant_min_distance\'},inplace = True)
# 提取用户和商店距离的最大值
t6 = t4.groupby(\'merchant_id\').agg(\'max\').reset_index()
t5.rename(columns={\'distance\':\'merchant_max_distance\'},inplace = True)
# 提取用户和商品距离的平均值
t7 = t4.groupby(\'merchant_id\').agg(\'mean\').reset_index()
t7.rename(columns={\'distance\':\'merchant_mean_distance\'},inplace = True)
# 提取用户与商品距离的中位数
t8 = t4.groupby(\'merchant_id\').agg(\'median\').reset_index()
# 把特征集合入一张表里
merchant3_feature = pd.merge(t,t1,on=\'merchant_id\',how=\'left\')
merchant3_feature = pd.merge(merchant3_feature,t2,on=\'merchant_id\',how=\'left\')
merchant3_feature = pd.merge(merchant3_feature,t3,on=\'merchant_id\',how=\'left\')
merchant3_feature = pd.merge(merchant3_feature,t5,on=\'merchant_id\',how=\'left\')
merchant3_feature = pd.merge(merchant3_feature,t6,on=\'merchant_id\',how=\'left\')
merchant3_feature = pd.merge(merchant3_feature,t7,on=\'merchant_id\',how=\'left\')
merchant3_feature = pd.merge(merchant3_feature,t8,on=\'merchant_id\',how=\'left\')
# merchant3_feature.head()
# 替换数据中的NAN为0,便于计算优惠券的使用率以及其他信息
# 优惠券的使用率、卖出的商品中使用优惠券的占比
merchant3_feature.sales_use_coupon = merchant3_feature.sales_use_coupon.replace(np.nan,0)
merchant3_feature[\'merchant_coupon_transfer_rate\'] = merchant3_feature.sales_use_coupon.astype(\'float\') / merchant3_feature.total_sales
merchant3_feature[\'coupon_rate\'] = merchant3_feature.sales_use_coupon.astype(\'float\') / merchant3_feature.total_sales
merchant3_feature.total_coupon = merchant3_feature.total_coupon.replace(np.nan,0)
merchant3_feature.to_csv(\'feature/merchant3_feature.csv\',index=None)


# 对feature2进行操作
merchant2 = feature2[[\'merchant_id\',\'coupon_id\',\'distance\',\'date_received\',\'date\']]
t = merchant2[[\'merchant_id\']]
# 删除重复的行数据
t.drop_duplicates(inplace=True)
# 显示卖出的商品,以及卖出的数量
# []用来强调条件\或者新建一列并赋值   [[]]用来表示取哪个列来使用
t1 = merchant2[merchant2.date!=\'null\'][[\'merchant_id\']]
t1[\'total_sales\'] = 1
t1 = t1.groupby(\'merchant_id\').agg(\'sum\').reset_index()
# 显示使用了优惠券消费的商品,正样本
t2 = merchant2[(merchant2.date!=\'null\')&(merchant2.coupon_id!=\'null\')][[\'merchant_id\']]
t2[\'sales_use_coupon\'] = 1
t2 = t2.groupby(\'merchant_id\').agg(\'sum\').reset_index()
# 提取商品优惠券的总数量
t3 = merchant2[merchant2.coupon_id!=\'null\'][[\'merchant_id\']]
t3[\'total_coupon\'] = 1
t3 = t3.groupby(\'merchant_id\').agg(\'sum\').reset_index()
# 提取销量与距离的关系
# 把数据中的空值全部替换为 -1
t4 = merchant2[(merchant2.date!=\'null\')&(merchant2.coupon_id!=\'null\')][[\'merchant_id\',\'distance\']]
t4.replace(\'null\',-1,inplace=True)
t4.distance = t4.distance.astype(\'int\')
t4.replace(-1,np.nan,inplace=True)
# 提取用户和商店距离的最小值
t5 = t4.groupby(\'merchant_id\').agg(\'min\').reset_index()
t5.rename(columns={\'distance\':\'merchant_min_distance\'},inplace = True)
# 提取用户和商店距离的最大值
t6 = t4.groupby(\'merchant_id\').agg(\'max\').reset_index()
t5.rename(columns={\'distance\':\'merchant_max_distance\'},inplace = True)
# 提取用户和商品距离的平均值
t7 = t4.groupby(\'merchant_id\').agg(\'mean\').reset_index()
t7.rename(columns={\'distance\':\'merchant_mean_distance\'},inplace = True)
# 提取用户与商品距离的中位数
t8 = t4.groupby(\'merchant_id\').agg(\'median\').reset_index()
# 把特征集合入一张表里
merchant2_feature = pd.merge(t,t1,on=\'merchant_id\',how=\'left\')
merchant2_feature = pd.merge(merchant2_feature,t2,on=\'merchant_id\',how=\'left\')
merchant2_feature = pd.merge(merchant2_feature,t3,on=\'merchant_id\',how=\'left\')
merchant2_feature = pd.merge(merchant2_feature,t5,on=\'merchant_id\',how=\'left\')
merchant2_feature = pd.merge(merchant2_feature,t6,on=\'merchant_id\',how=\'left\')
merchant2_feature = pd.merge(merchant2_feature,t7,on=\'merchant_id\',how=\'left\')
merchant2_feature = pd.merge(merchant2_feature,t8,on=\'merchant_id\',how=\'left\')
# merchant3_feature.head()
# 替换数据中的NAN为0,便于计算优惠券的使用率以及其他信息
# 优惠券的使用率、卖出的商品中使用优惠券的占比
merchant2_feature.sales_use_coupon = merchant2_feature.sales_use_coupon.replace(np.nan,0)
merchant2_feature[\'merchant_coupon_transfer_rate\'] = merchant2_feature.sales_use_coupon.astype(\'float\') / merchant2_feature.total_sales
merchant2_feature[\'coupon_rate\'] = merchant2_feature.sales_use_coupon.astype(\'float\') / merchant2_feature.total_sales
merchant2_feature.total_coupon = merchant2_feature.total_coupon.replace(np.nan,0)
merchant2_feature.to_csv(\'feature/merchant2_feature.csv\',index=None)



# 对feature1进行操作
merchant1 = feature1[[\'merchant_id\',\'coupon_id\',\'distance\',\'date_received\',\'date\']]
t = merchant1[[\'merchant_id\']]
# 删除重复的行数据
t.drop_duplicates(inplace=True)
# 显示卖出的商品,以及卖出的数量
# []用来强调条件\或者新建一列并赋值   [[]]用来表示取哪个列来使用
t1 = merchant1[merchant1.date!=\'null\'][[\'merchant_id\']]
t1[\'total_sales\'] = 1
t1 = t1.groupby(\'merchant_id\').agg(\'sum\').reset_index()
# 显示使用了优惠券消费的商品,正样本
t2 = merchant1[(merchant1.date!=\'null\')&(merchant1.coupon_id!=\'null\')][[\'merchant_id\']]
t2[\'sales_use_coupon\'] = 1
t2 = t2.groupby(\'merchant_id\').agg(\'sum\').reset_index()
# 提取商品优惠券的总数量
t3 = merchant1[merchant1.coupon_id!=\'null\'][[\'merchant_id\']]
t3[\'total_coupon\'] = 1
t3 = t3.groupby(\'merchant_id\').agg(\'sum\').reset_index()
# 提取销量与距离的关系
# 把数据中的空值全部替换为 -1
t4 = merchant1[(merchant1.date!=\'null\')&(merchant1.coupon_id!=\'null\')][[\'merchant_id\',\'distance\']]
t4.replace(\'null\',-1,inplace=True)
t4.distance = t4.distance.astype(\'int\')
t4.replace(-1,np.nan,inplace=True)
# 提取用户和商店距离的最小值
t5 = t4.groupby(\'merchant_id\').agg(\'min\').reset_index()
t5.rename(columns={\'distance\':\'merchant_min_distance\'},inplace = True)
# 提取用户和商店距离的最大值
t6 = t4.groupby(\'merchant_id\').agg(\'max\').reset_index()
t5.rename(columns={\'distance\':\'merchant_max_distance\'},inplace = True)
# 提取用户和商品距离的平均值
t7 = t4.groupby(\'merchant_id\').agg(\'mean\').reset_index()
t7.rename(columns={\'distance\':\'merchant_mean_distance\'},inplace = True)
# 提取用户与商品距离的中位数
t8 = t4.groupby(\'merchant_id\').agg(\'median\').reset_index()
# 把特征集合入一张表里
merchant1_feature = pd.merge(t,t1,on=\'merchant_id\',how=\'left\')
merchant1_feature = pd.merge(merchant1_feature,t2,on=\'merchant_id\',how=\'left\')
merchant1_feature = pd.merge(merchant1_feature,t3,on=\'merchant_id\',how=\'left\')
merchant1_feature = pd.merge(merchant1_feature,t5,on=\'merchant_id\',how=\'left\')
merchant1_feature = pd.merge(merchant1_feature,t6,on=\'merchant_id\',how=\'left\')
merchant1_feature = pd.merge(merchant1_feature,t7,on=\'merchant_id\',how=\'left\')
merchant1_feature = pd.merge(merchant1_feature,t8,on=\'merchant_id\',how=\'left\')
# merchant3_feature.head()
# 替换数据中的NAN为0,便于计算优惠券的使用率以及其他信息
# 优惠券的使用率、卖出的商品中使用优惠券的占比
merchant1_feature.sales_use_coupon = merchant1_feature.sales_use_coupon.replace(np.nan,0)
merchant1_feature[\'merchant_coupon_transfer_rate\'] = merchant1_feature.sales_use_coupon.astype(\'float\') / merchant1_feature.total_sales
merchant1_feature[\'coupon_rate\'] = merchant1_feature.sales_use_coupon.astype(\'float\') / merchant1_feature.total_sales
merchant1_feature.total_coupon = merchant1_feature.total_coupon.replace(np.nan,0)
merchant1_feature.to_csv(\'feature/merchant1_feature.csv\',index=None)

  4、提取用户的相关特征

"""
用户的相关信息:
    count_merchant
    user_avg_distance,user_min_distance,user_max_distance
    buy_use_coupon,buy_total,coupon_received
    buy_use_coupon/coupon_received
    buy_use_coupon/buy_total
    user_date_datereceived_gap
"""
def get_user_date_datereceived_gap(s):
    s = s.split(\':\')
    return(dt.date(int(s[0][0:4]),int(s[0][4:6]),int(s[0][6:8])) - dt.date(int(s[1][0:4]),int(s[1][4:6]),int(s[1][6:8]))).days
# 数据集3的处理
user3 = feature3[[\'user_id\',\'merchant_id\',\'coupon_id\',\'discount_rate\',\'distance\',\'date_received\',\'date\']]
t = user3[[\'user_id\']]
# 去掉数据中重复的用户ID
t.drop_duplicates(inplace=True)
# 用户购买商品的种类
t1 = user3[user3.date!=\'null\'][[\'user_id\',\'merchant_id\']]
t1.drop_duplicates(inplace=True)
t1.merchant_id = 1
t1 = t1.groupby(\'user_id\').agg(\'sum\').reset_index()
t1.rename(columns={\'merchant_id\':\'count_merchant\'},inplace=True)
# 使用了优惠券购买商品的用户id和距离
t2 = user3[(user3.date!=\'null\')&(user3.coupon_id!=\'null\')][[\'user_id\',\'distance\']]
t2.replace(\'null\',-1,inplace=True)
t2.distance = t2.distance.astype(\'int\')
t2.replace(-1,np.nan,inplace=True)
# 得到使用优惠券购买商品的用户距商店的最短距离
t3 = t2.groupby(\'user_id\').agg(\'min\').reset_index()
t3.rename(columns={\'distance\':\'user_min_dsitance\'},inplace=True)
# 最大距离
t4 = t2.groupby(\'user_id\').agg(\'max\').reset_index()
t4.rename(columns={\'distance\':\'user_max_distance\'},inplace=True)
# 平均距离
t5 = t2.groupby(\'user_id\').agg(\'mean\').reset_index()
t5.rename(columns={\'distance\':\'user_mean_distance\'},inplace=True)
# 中位数距离
t6 = t2.groupby(\'user_id\').agg(\'median\').reset_index()
t6.rename(columns={\'distance\':\'user_median_distance\'},inplace=True)
# 每个用户使用优惠券购买的商品数量
t7 = user3[(user3.date!=\'null\')&(user3.coupon_id!=\'null\')][[\'user_id\']]
t7[\'buy_use_coupon\'] = 1
t7 = t7.groupby(\'user_id\').agg(\'sum\').reset_index()
# 购买商品的总数
t8 = user3[user3.date!=\'null\'][[\'user_id\']]
t8[\'buy_total\'] = 1
t8 = t8.groupby(\'user_id\').agg(\'sum\').reset_index()
# 接收优惠券的总数
t9 = user3[user3.coupon_id!=\'null\'][[\'user_id\']]
t9[\'coupon_received\'] = 1
t9 = t9.groupby(\'user_id\').agg(\'sum\').reset_index()
# 收到优惠券的日期和使用之间的距离
t10 = user3[(user3.date_received !=\'null\')&(user3.date!=\'null\')][[\'user_id\',\'date_received\',\'date\']]
t10[\'user_date_datereceived_gap\'] = t10.date+\':\'+t10.date_received
t10.user_date_datereceived_gap = t10.user_date_datereceived_gap.apply(get_user_date_datereceived_gap)
t10 = t10[[\'user_id\',\'user_date_datereceived_gap\']]
# 将用户优惠券使用时间的间隔取平均值
t11 = t10.groupby(\'user_id\').agg(\'mean\').reset_index()
t11.rename(columns={\'user_date_datereceived_gap\':\'avg_user_date_datereceived_gap\'},inplace=True)
# 间隔天数的最小值
t12 = t10.groupby(\'user_id\').agg(\'min\').reset_index()
t12.rename(columns={\'user_date_datereceived_gap\':\'min_user_date_datereceived_gap\'},inplace=True)
# 间隔天数的最大值
t13 = t10.groupby(\'user_id\').agg(\'max\').reset_index()
t13.rename(columns={\'user_date_datereceived_gap\':\'max_user_date_datereceived_gap\'},inplace=True)
# 合并特征
user3_feature = pd.merge(t,t1,on=\'user_id\',how=\'left\')
user3_feature = pd.merge(user3_feature,t3,on=\'user_id\',how=\'left\')
user3_feature = pd.merge(user3_feature,t4,on=\'user_id\',how=\'left\')
user3_feature = pd.merge(user3_feature,t5,on=\'user_id\',how=\'left\')
user3_feature = pd.merge(user3_feature,t6,on=\'user_id\',how=\'left\')
user3_feature = pd.merge(user3_feature,t7,on=\'user_id\',how=\'left\')
user3_feature = pd.merge(user3_feature,t8,on=\'user_id\',how=\'left\')
user3_feature = pd.merge(user3_feature,t9,on=\'user_id\',how=\'left\')
user3_feature = pd.merge(user3_feature,t11,on=\'user_id\',how=\'left\')
user3_feature = pd.merge(user3_feature,t12,on=\'user_id\',how=\'left\')
user3_feature = pd.merge(user3_feature,t13,on=\'user_id\',how=\'left\')
user3_feature.count_merchant = user3_feature.count_merchant.replace(np.nan,0)
user3_feature.buy_user_coupon = user3_feature.buy_use_coupon.replace(np.nan,0)
user3_feature[\'buy_use_coupon_rate\'] = user3_feature.buy_use_coupon.astype(\'float\')/user3_feature.buy_total.astype(\'float\')
user3_feature[\'user_coupon_transfer_rate\'] = user3_feature.buy_use_coupon.astype(\'float\')/user3_feature.buy_use_coupon.astype(\'float\')
user3_feature.buy_total = user3_feature.buy_total.replace(np.nan,0)
user3_feature.coupon_received = user3_feature.coupon_received.replace(np.nan,0)
user3_feature.to_csv(\'feature/user3_feature.csv\',index=None)


# 对数据集faeture2操作
user2 = feature2[[\'user_id\',\'merchant_id\',\'coupon_id\',\'discount_rate\',\'distance\',\'date_received\',\'date\']]
t = user2[[\'user_id\']]
# 去掉数据中重复的用户ID
t.drop_duplicates(inplace=True)
# 用户购买商品的种类
t1 = user2[user2.date!=\'null\'][[\'user_id\',\'merchant_id\']]
t1.drop_duplicates(inplace=True)
t1.merchant_id = 1
t1 = t1.groupby(\'user_id\').agg(\'sum\').reset_index()
t1.rename(columns={\'merchant_id\':\'count_merchant\'},inplace=True)
# 使用了优惠券购买商品的用户id和距离
t2 = user2[(user2.date!=\'null\')&(user2.coupon_id!=\'null\')][[\'user_id\',\'distance\']]
t2.replace(\'null\',-1,inplace=True)
t2.distance = t2.distance.astype(\'int\')
t2.replace(-1,np.nan,inplace=True)
# 得到使用优惠券购买商品的用户距商店的最短距离
t3 = t2.groupby(\'user_id\').agg(\'min\').reset_index()
t3.rename(columns={\'distance\':\'user_min_dsitance\'},inplace=True)
# 最大距离
t4 = t2.groupby(\'user_id\').agg(\'max\').reset_index()
t4.rename(columns={\'distance\':\'user_max_distance\'},inplace=True)
# 平均距离
t5 = t2.groupby(\'user_id\').agg(\'mean\').reset_index()
t5.rename(columns={\'distance\':\'user_mean_distance\'},inplace=True)
# 中位数距离
t6 = t2.groupby(\'user_id\').agg(\'median\').reset_index()
t6.rename(columns={\'distance\':\'user_median_distance\'},inplace=True)
# 每个用户使用优惠券购买的商品数量
t7 = user2[(user2.date!=\'null\')&(user2.coupon_id!=\'null\')][[\'user_id\']]
t7[\'buy_use_coupon\'] = 1
t7 = t7.groupby(\'user_id\').agg(\'sum\').reset_index()
# 购买商品的总数
t8 = user2[user2.date!=\'null\'][[\'user_id\']]
t8[\'buy_total\'] = 1
t8 = t8.groupby(\'user_id\').agg(\'sum\').reset_index()
# 接收优惠券的总数
t9 = user2[user2.coupon_id!=\'null\'][[\'user_id\']]
t9[\'coupon_received\'] = 1
t9 = t9.groupby(\'user_id\').agg(\'sum\').reset_index()
# 收到优惠券的日期和使用之间的距离
t10 = user2[(user2.date_received !=\'null\')&(user2.date!=\'null\')][[\'user_id\',\'date_received\',\'date\']]
t10[\'user_date_datereceived_gap\'] = t10.date+\':\'+t10.date_received
t10.user_date_datereceived_gap = t10.user_date_datereceived_gap.apply(get_user_date_datereceived_gap)
t10 = t10[[\'user_id\',\'user_date_datereceived_gap\']]
# 将用户优惠券使用时间的间隔取平均值
t11 = t10.groupby(\'user_id\').agg(\'mean\').reset_index()
t11.rename(columns={\'user_date_datereceived_gap\':\'avg_user_date_datereceived_gap\'},inplace=True)
# 间隔天数的最小值
t12 = t10.groupby(\'user_id\').agg(\'min\').reset_index()
t12.rename(columns={\'user_date_datereceived_gap\':\'min_user_date_datereceived_gap\'},inplace=True)
# 间隔天数的最大值
t13 = t10.groupby(\'user_id\').agg(\'max\').reset_index()
t13.rename(columns={\'user_date_datereceived_gap\':\'max_user_date_datereceived_gap\'},inplace=True)
# 合并特征
user2_feature = pd.merge(t,t1,on=\'user_id\',how=\'left\')
user2_feature = pd.merge(user2_feature,t3,on=\'user_id\',how=\'left\')
user2_feature = pd.merge(user2_feature,t4,on=\'user_id\',how=\'left\')
user2_feature = pd.merge(user2_feature,t5,on=\'user_id\',how=\'left\')
user2_feature = pd.merge(user2_feature,t6,on=\'user_id\',how=\'left\')
user2_feature = pd.merge(user2_feature,t7,on=\'user_id\',how=\'left\')
user2_feature = pd.merge(user2_feature,t8,on=\'user_id\',how=\'left\')
user2_feature = pd.merge(user2_feature,t9,on=\'user_id\',how=\'left\')
user2_feature = pd.merge(user2_feature,t11,on=\'user_id\',how=\'left\')
user2_feature = pd.merge(user2_feature,t12,on=\'user_id\',how=\'left\')
user2_feature = pd.merge(user2_feature,t13,on=\'user_id\',how=\'left\')
user2_feature.count_merchant = user2_feature.count_merchant.replace(np.nan,0)
user2_feature.buy_user_coupon = user2_feature.buy_use_coupon.replace(np.nan,0)
user2_feature[\'buy_use_coupon_rate\'] = user2_feature.buy_use_coupon.astype(\'float\')/user2_feature.buy_total.astype(\'float\')
user2_feature[\'user_coupon_transfer_rate\'] = user2_feature.buy_use_coupon.astype(\'float\')/user2_feature.buy_use_coupon.astype(\'float\')
user2_feature.buy_total = user2_feature.buy_total.replace(np.nan,0)
user2_feature.coupon_received = user2_feature.coupon_received.replace(np.nan,0)
user2_feature.to_csv(\'feature/user2_feature.csv\',index=None)


# 对数据集dataset1操作
user1 = feature1[[\'user_id\',\'merchant_id\',\'coupon_id\',\'discount_rate\',\'distance\',\'date_received\',\'date\']]
t = user1[[\'user_id\']]
# 去掉数据中重复的用户ID
t.drop_duplicates(inplace=True)
# 用户购买商品的种类
t1 = user1[user1.date!=\'null\'][[\'user_id\',\'merchant_id\']]
t1.drop_duplicates(inplace=True)
t1.merchant_id = 1
t1 = t1.groupby(\'user_id\').agg(\'sum\').reset_index()
t1.rename(columns={\'merchant_id\':\'count_merchant\'},inplace=True)
# 使用了优惠券购买商品的用户id和距离
t2 = user1[(user1.date!=\'null\')&(user1.coupon_id!=\'null\')][[\'user_id\',\'distance\']]
t2.replace(\'null\',-1,inplace=True)
t2.distance = t2.distance.astype(\'int\')
t2.replace(-1,np.nan,inplace=True)
# 得到使用优惠券购买商品的用户距商店的最短距离
t3 = t2.groupby(\'user_id\').agg(\'min\').reset_index()
t3.rename(columns={\'distance\':\'user_min_dsitance\'},inplace=True)
# 最大距离
t4 = t2.groupby(\'user_id\').agg(\'max\').reset_index()
t4.rename(columns={\'distance\':\'user_max_distance\'},inplace=True)
# 平均距离
t5 = t2.groupby(\'user_id\').agg(\'mean\').reset_index()
t5.rename(columns={\'distance\':\'user_mean_distance\'},inplace=True)
# 中位数距离
t6 = t2.groupby(\'user_id\').agg(\'median\').reset_index()
t6.rename(columns={\'distance\':\'user_median_distance\'},inplace=True)
# 每个用户使用优惠券购买的商品数量
t7 = user1[(user3.date!=\'null\')&(user1.coupon_id!=\'null\')][[\'user_id\']]
t7[\'buy_use_coupon\'] = 1
t7 = t7.groupby(\'user_id\').agg(\'sum\').reset_index()
# 购买商品的总数
t8 = user1[user1.date!=\'null\'][[\'user_id\']]
t8[\'buy_total\'] = 1
t8 = t8.groupby(\'user_id\').agg(\'sum\').reset_index()
# 接收优惠券的总数
t9 = user1[user1.coupon_id!=\'null\'][[\'user_id\']]
t9[\'coupon_received\'] = 1
t9 = t9.groupby(\'user_id\').agg(\'sum\').reset_index()
# 收到优惠券的日期和使用之间的距离
t10 = user1[(user1.date_received !=\'null\')&(user1.date!=\'null\')][[\'user_id\',\'date_received\',\'date\']]
t10[\'user_date_datereceived_gap\'] = t10.date+\':\'+t10.date_received
t10.user_date_datereceived_gap = t10.user_date_datereceived_gap.apply(get_user_date_datereceived_gap)
t10 = t10[[\'user_id\',\'user_date_datereceived_gap\']]
# 将用户优惠券使用时间的间隔取平均值
t11 = t10.groupby(\'user_id\').agg(\'mean\').reset_index()
t11.rename(columns={\'user_date_datereceived_gap\':\'avg_user_date_datereceived_gap\'},inplace=True)
# 间隔天数的最小值
t12 = t10.groupby(\'user_id\').agg(\'min\').reset_index()
t12.rename(columns={\'user_date_datereceived_gap\':\'min_user_date_datereceived_gap\'},inplace=True)
# 间隔天数的最大值
t13 = t10.groupby(\'user_id\').agg(\'max\').reset_index()
t13.rename(columns={\'user_date_datereceived_gap\':\'max_user_date_datereceived_gap\'},inplace=True)
# 合并特征
user1_feature = pd.merge(t,t1,on=\'user_id\',how=\'left\')
user1_feature = pd.merge(user1_feature,t3,on=\'user_id\',how=\'left\')
user1_feature = pd.merge(user1_feature,t4,on=\'user_id\',how=\'left\')
user1_feature = pd.merge(user1_feature,t5,on=\'user_id\',how=\'left\')
user1_feature = pd.merge(user1_feature,t6,on=\'user_id\',how=\'left\')
user1_feature = pd.merge(user1_feature,t7,on=\'user_id\',how=\'left\')
user1_feature = pd.merge(user1_feature,t8,on=\'user_id\',how=\'left\')
user1_feature = pd.merge(user1_feature,t9,on=\'user_id\',how=\'left\')
user1_feature = pd.merge(user1_feature,t11,on=\'user_id\',how=\'left\')
user1_feature = pd.merge(user1_feature,t12,on=\'user_id\',how=\'left\')
user1_feature = pd.merge(user1_feature,t13,on=\'user_id\',how=\'left\')
user1_feature.count_merchant = user1_feature.count_merchant.replace(np.nan,0)
user1_feature.buy_user_coupon = user1_feature.buy_use_coupon.replace(np.nan,0)
user1_feature[\'buy_use_coupon_rate\'] = user1_feature.buy_use_coupon.astype(\'float\')/user1_feature.buy_total.astype(\'float\')
user1_feature[\'user_coupon_transfer_rate\'] = user1_feature.buy_use_coupon.astype(\'float\')/user1_feature.buy_use_coupon.astype(\'float\')
user1_feature.buy_total = user1_feature.buy_total.replace(np.nan,0)
user1_feature.coupon_received = user1_feature.coupon_received.replace(np.nan,0)
user1_feature.to_csv(\'feature/user1_feature.csv\',index=None)

  5、用户和商店之间联系的特征

# 对数据集feature3进行处理

# 用户和商店之间联系的特征
all_user_merchant = feature3[[\'user_id\',\'merchant_id\']]
all_user_merchant.drop_duplicates(inplace=True)
# 只保留销售了商品的商户id
t = feature3[[\'user_id\',\'merchant_id\',\'date\']]
t = t[t.date!=\'null\'][[\'user_id\',\'merchant_id\']]
# 用户一共买了特定商户多少商品
t[\'user_merchant_buy_total\'] = 1
t = t.groupby([\'user_id\',\'merchant_id\']).agg(\'sum\').reset_index()
t.drop_duplicates(inplace=True)
t1 = feature3[[\'user_id\',\'merchant_id\',\'coupon_id\']]
t1 = t1[t1.coupon_id!=\'null\'][[\'user_id\',\'merchant_id\']]
# 用户一共收到一个商户的优惠劵数目
t[\'user_merchant_received\'] = 1
t1 = t1.groupby([\'user_id\',\'merchant_id\']).agg(\'sum\').reset_index()
t1.drop_duplicates(inplace = True)

t2 = feature3[[\'user_id\',\'merchant_id\',\'date\',\'date_received\']]
t2 = t2[(t2.date!=\'null\')&(t2.date_received!=\'null\')][[\'user_id\',\'merchant_id\']]
# 用户在一家商户使用优惠券购买的商品数目
t2[\'user_merchant_buy_use_coupon\'] = 1
t2 = t2.groupby([\'user_id\',\'merchant_id\']).agg(\'sum\').reset_index()
t2.drop_duplicates(inplace = True)
# 用户在一家商家的说有记录总数
t3 = feature3[[\'user_id\',\'merchant_id\']]
t3[\'user_merchant_any\'] = 1
t3 = t3.groupby([\'user_id\',\'merchant_id\']).agg(\'sum\').reset_index()
t3.drop_duplicates(inplace=True)
# 用户未使用优惠券购买的商品数目
t4 = feature3[[\'user_id\',\'merchant_id\',\'date\',\'coupon_id\']]
t4 = t4[(t4.date!=\'null\')&(t4.coupon_id==\'null\')][[\'user_id\',\'merchant_id\']]
t4[\'user_merchant_buy_common\'] = 1
t4 = t4.groupby([\'user_id\',\'merchant_id\']).agg(\'sum\').reset_index()
t4.drop_duplicates(inplace = True)
user_merchant3 = pd.merge(all_user_merchant,t,on=[\'user_id\',\'merchant_id\'],how=\'left\')
user_merchant3 = pd.merge(user_merchant3,t1,on=[\'user_id\',\'merchant_id\'],how=\'left\')
user_merchant3 = pd.merge(user_merchant3,t2,on=[\'user_id\',\'merchant_id\'],how=\'left\')
user_merchant3 = pd.merge(user_merchant3,t3,on=[\'user_id\',\'merchant_id\'],how=\'left\')
user_merchant3 = pd.merge(user_merchant3,t4,on=[\'user_id\',\'merchant_id\'],how=\'left\')
# 都是针对一个商户和一个用户
user_merchant3[\'user_merchant_coupon_transfer_rate\'] = user_merchant3.user_merchant_buy_use_coupon.astype(\'float\') / user_merchant3.user_merchant_received.astype(\'float\')
user_merchant3[\'user_merchant_coupon_buy_rate\'] = user_merchant3.user_merchant_buy_use_coupon.astype(\'float\')/user_merchant3.user_merchant_buy_total.astype(\'float\')
user_merchant3[\'user_merchant_rate\'] = user_merchant3.user_merchant_buy_total.astype(\'float\')/user_merchant3.user_merchant_any.astype(\'float\')
user_merchant3[\'user_merchant_common_buy_rate\'] = user_merchant3.user_merchant_buy_common.astype(\'float\')/user_merchant3.user_merchant_buy_total.astype(\'float\')
user_merchant3.to_csv(\'feature/user_merchant3.csv\',index=None)


# 对于数据集feature2
all_user_merchant = feature2[[\'user_id\',\'merchant_id\']]
all_user_merchant.drop_duplicates(inplace=True)
t = feature2[[\'user_id\',\'merchant_id\',\'date\']]
t = t[t.date!=\'null\'][[\'user_id\',\'merchant_id\']]
t[\'user_merchant_buy_total\'] = 1
t = t.groupby([\'user_id\',\'merchant_id\']).agg(\'sum\').reset_index()
t.drop_duplicates(inplace=True)
t1 = feature2[[\'user_id\',\'merchant_id\',\'coupon_id\']]
t1 = t1[t1.coupon_id!=\'null\'][[\'user_id\',\'merchant_id\']]
t1[\'user_merchant_received\'] = 1
t1 = t1.groupby([\'user_id\',\'merchant_id\']).agg(\'sum\').reset_index()
t1.drop_duplicates(inplace = True)

t2 = feature2[[\'user_id\',\'merchant_id\',\'date\',\'date_received\']]
t2 = t2[(t2.date!=\'null\')&(t2.date_received!=\'null\')][[\'user_id\',\'merchant_id\']]
t2[\'user_merchant_buy_use_coupon\'] = 1
t2 = t2.groupby([\'user_id\',\'merchant_id\']).agg(\'sum\').reset_index()
t2.drop_duplicates(inplace=True)

t3 = feature2[[\'user_id\',\'merchant_id\']]
t3[\'user_merchant_any\'] = 1
t3 = t3.groupby([\'user_id\',\'merchant_id\']).agg(\'sum\').reset_index()
t3.drop_duplicates(inplace=True)

t4 = feature2[[\'user_id\',\'merchant_id\',\'date\',\'coupon_id\']]
t4 = t4[(t4.date!=\'null\')&(t4.coupon_id == \'null\')][[\'user_id\',\'merchant_id\']]
t4[\'user_merchant_buy_common\'] = 1
t4 = t4.groupby([\'user_id\',\'merchant_id\']).agg(\'sum\').reset_index()
t4.drop_duplicates(inplace=True)

user_merchant2 = pd.merge(all_user_merchant,t,on=[\'user_id\',\'merchant_id\'],how=\'left\')
user_merchant2 = pd.merge(user_merchant2,t1,on=[\'user_id\',\'merchant_id\'],how=\'left\')
user_merchant2 = pd.merge(user_merchant2,t2,on=[\'user_id\',\'merchant_id\'],how=\'left\')
user_merchant2 = pd.merge(user_merchant2,t3,on=[\'user_id\',\'merchant_id\'],how=\'left\')
user_merchant2 = pd.merge(user_merchant2,t4,on=[\'user_id\',\'merchant_id\'],how=\'left\')

user_merchant2.user_merchant_buy_use_coupon = user_merchant2.user_merchant_buy_use_coupon.replace(np.nan,0)
user_merchant2.user_merchant_buy_common = user_merchant2.user_merchant_buy_common.replace(np.nan,0)

user_merchant2[\'user_merchant_coupon_transfer_rate\'] = user_merchant2.user_merchant_buy_use_coupon.astype(\'float\')/user_merchant2.user_merchant_received.astype(\'float\')
user_merchant2[\'user_merchant_coupon_buy_rate\'] = user_merchant2.user_merchant_buy_use_coupon.astype(\'float\')/user_merchant2.user_merchant_buy_total.astype(\'float\')
user_merchant2[\'user_merchant_rate\'] = user_merchant2.user_merchant_buy_total.astype(\'float\')/user_merchant2.user_merchant_any.astype(\'float\')
user_merchant2[\'user_merchant_common_buy_rate\'] = user_merchant2.user_merchant_buy_common.astype(\'float\')/user_merchant2.user_merchant_buy_total.astype(\'float\')
user_merchant2.to_csv(\'feature/user_merchant2.csv\',index=None)


# 对于数据集feature1
all_user_merchant = feature1[[\'user_id\',\'merchant_id\']]
all_user_merchant.drop_duplicates(inplace=True)
t = feature1[[\'user_id\',\'merchant_id\',\'date\']]
t = t[t.date!=\'null\'][[\'user_id\',\'merchant_id\']]
t[\'user_merchant_buy_total\'] = 1
t = t.groupby([\'user_id\',\'merchant_id\']).agg(\'sum\').reset_index()
t.drop_duplicates(inplace=True)
t1 = feature1[[\'user_id\',\'merchant_id\',\'coupon_id\']]
t1 = t1[t1.coupon_id!=\'null\'][[\'user_id\',\'merchant_id\']]
t1[\'user_merchant_received\'] = 1
t1 = t1.groupby([\'user_id\',\'merchant_id\']).agg(\'sum\').reset_index()
t1.drop_duplicates(inplace = True)

t2 = feature1[[\'user_id\',\'merchant_id\',\'date\',\'date_received\']]
t2 = t2[(t2.date!=\'null\')&(t2.date_received!=\'null\')][[\'user_id\',\'merchant_id\']]
t2[\'user_merchant_buy_use_coupon\'] = 1
t2 = t2.groupby([\'user_id\',\'merchant_id\']).agg(\'sum\').reset_index()
t2.drop_duplicates(inplace=True)

t3 = feature1[[\'user_id\',\'merchant_id\']]
t3[\'user_merchant_any\'] = 1
t3 = t3.groupby([\'user_id\',\'merchant_id\']).agg(\'sum\').reset_index()
t3.drop_duplicates(inplace=True)

t4 = feature1[[\'user_id\',\'merchant_id\',\'date\',\'coupon_id\']]
t4 = t4[(t4.date!=\'null\')&(t4.coupon_id == \'null\')][[\'user_id\',\'merchant_id\']]
t4[\'user_merchant_buy_common\'] = 1
t4 = t4.groupby([\'user_id\',\'merchant_id\']).agg(\'sum\').reset_index()
t4.drop_duplicates(inplace=True)

user_merchant1 = pd.merge(all_user_merchant,t,on=[\'user_id\',\'merchant_id\'],how=\'left\')
user_merchant1 = pd.merge(user_merchant1,t1,on=[\'user_id\',\'merchant_id\'],how=\'left\')
user_merchant1 = pd.merge(user_merchant1,t2,on=[\'user_id\',\'merchant_id\'],how=\'left\')
user_merchant1 = pd.merge(user_merchant1,t3,on=[\'user_id\',\'merchant_id\'],how=\'left\')
user_merchant1 = pd.merge(user_merchant1,t4,on=[\'user_id\',\'merchant_id\'],how=\'left\')

user_merchant1.user_merchant_buy_use_coupon = user_merchant1.user_merchant_buy_use_coupon.replace(np.nan,0)
user_merchant1.user_merchant_buy_common = user_merchant1.user_merchant_buy_common.replace(np.nan,0)

user_merchant1[\'user_merchant_coupon_transfer_rate\'] = user_merchant1.user_merchant_buy_use_coupon.astype(\'float\')/user_merchant1.user_merchant_received.astype(\'float\')
user_merchant1[\'user_merchant_coupon_buy_rate\'] = user_merchant1.user_merchant_buy_use_coupon.astype(\'float\')/user_merchant1.user_merchant_buy_total.astype(\'float\')
user_merchant1[\'user_merchant_rate\'] = user_merchant1.user_merchant_buy_total.astype(\'float\')/user_merchant1.user_merchant_any.astype(\'float\')
user_merchant1[\'user_merchant_common_buy_rate\'] = user_merchant1.user_merchant_buy_common.astype(\'float\')/user_merchant1.user_merchant_buy_total.astype(\'float\')
user_merchant1.to_csv(\'feature/user_merchant1.csv\',index=None)

  三、特征组合

# dataset1,2,3分别是划分时间滑窗后的所提取的特征的组合,方便接下来的划分训练集和测试集
# 此次合并后dataset1,2,3的特征类型是一样的,然后给dataset1,2添加标签,标签是get_label(s)这个函数所生成的
def get_label(s):
    s = s.split(\':\')
    if s[0]==\'null\':
        return 0
    elif (dt.date(int(s[0][0:4]),int(s[0][4:6]),int(s[0][6:8]))-dt.date(int(s[1][0:4]),int(s[1][4:6]),int(s[1][6:8]))).days<15:
        return 1
    else:
        return -1

coupon3 = pd.read_csv(\'feature/coupon3_feature.csv\')
merchant3 = pd.read_csv(\'feature/merchant3_feature.csv\')
user3 = pd.read_csv(\'feature/user3_feature.csv\')
other_feature3 = pd.read_csv(\'feature/other_feature3.csv\')
user_merchant3 = pd.read_csv(\'feature/user_merchant3.csv\')
dataset3 = pd.merge(coupon3,merchant3,on=\'merchant_id\',how=\'left\')
dataset3 = pd.merge(dataset3,user3,on=\'user_id\',how=\'left\')
dataset3 = pd.merge(dataset3,user_merchant3,on=[\'user_id\',\'merchant_id\'],how=\'left\')
dataset3 = pd.merge(dataset3,other_feature3,on=[\'user_id\',\'coupon_id\',\'date_received\'],how=\'left\')
dataset3.drop_duplicates(inplace=True)

dataset3.user_merchant_buy_total = dataset3.user_merchant_buy_total.replace(np.nan,0)
dataset3.user_merchant_any = dataset3.user_merchant_any.replace(np.nan,0)
dataset3.user_merchant_received = dataset3.user_merchant_received.replace(np.nan,0)
dataset3[\'is_weekend\'] = dataset3.day_of_week.apply(lambda x:1 if x in (6,7) else 0)
# get_dummies 进行one-hot编码
weekday_dummies = pd.get_dummies(dataset3.day_of_week)
weekday_dummies.columns = [\'weekday\'+str(i+1) for i in range(weekday_dummies.shape[1])]
dataset3 = pd.concat([dataset3,weekday_dummies],axis=1)
# dataset3.columns
dataset3.drop([\'merchant_id\',\'day_of_week\',\'coupon_count\'],axis=1,inplace=True)
dataset3 = dataset3.replace(\'null\',np.nan)
dataset3.to_csv(\'dataset/dataset3.csv\',index=None)

coupon2 = pd.read_csv(\'feature/coupon2_feature.csv\')
merchant2 = pd.read_csv(\'feature/merchant2_feature.csv\')
user2 = pd.read_csv(\'feature/user2_feature.csv\')
user_merchant2 = pd.read_csv(\'feature/user_merchant2.csv\')
other_feature2 = pd.read_csv(\'feature/other_Feature2.csv\')
dataset2 = pd.merge(coupon2,merchant2,on=\'merchant_id\',how=\'left\')
dataset2 = pd.merge(dataset2,user2,on=\'user_id\',how=\'left\')
dataset2 = pd.merge(dataset2,user_merchant2,on=[\'user_id\',\'merchant_id\'],how=\'left\')
dataset2 = pd.merge(dataset2,other_feature2,on=[\'user_id\',\'coupon_id\',\'date_received\'],how=\'left\')
dataset2.drop_duplicates(inplace=True)
# dataset2.head()

dataset2.user_merchant_buy_total = dataset2.user_merchant_buy_total.replace(np.nan,0)
dataset2.user_merchant_any = dataset2.user_merchant_any.replace(np.nan,0)
dataset2.user_merchant_received = dataset2.user_merchant_received.replace(np.nan,0)
dataset2[\'is_weekend\'] = dataset2.day_of_week.apply(lambda x:1 if x in (6,7) else 0)
weekday_dummies = pd.get_dummies(dataset2.day_of_week)
weekday_dummies.columns = [\'weekday\'+str(i+1) for i in range(weekday_dummies.shape[1])]
dataset2 = pd.concat([dataset2,weekday_dummies],axis=1)
dataset2[\'label\'] = dataset2.date.astype(\'str\') + \':\' +  dataset2.date_received.astype(\'str\')
dataset2.label = dataset2.label.apply(get_label)
dataset2.drop([\'merchant_id\',\'day_of_week\',\'date\',\'date_received\',\'coupon_id\',\'coupon_count\'],axis=1,inplace=True)
dataset2 = dataset2.replace(\'null\',np.nan)
dataset2.to_csv(\'dataset/dataset2.csv\',index=None)


coupon1 = pd.read_csv(\'feature/coupon1_feature.csv\')
merchant1 = pd.read_csv(\'feature/merchant1_feature.csv\')
user1 = pd.read_csv(\'feature/user1_feature.csv\')
user_merchant1 = pd.read_csv(\'feature/user_merchant1.csv\')
other_feature1 = pd.read_csv(\'feature/other_feature1.csv\')
dataset1 = pd.merge(coupon1,merchant1,on=\'merchant_id\',how=\'left\')
dataset1 = pd.merge(dataset1,user1,on=\'user_id\',how=\'left\')
dataset1 = pd.merge(dataset1,user_merchant1,on=[\'user_id\',\'merchant_id\'],how=\'left\')
dataset1 = pd.merge(dataset1,other_feature1,on=[\'user_id\',\'coupon_id\',\'date_received\'],how=\'left\')
dataset1.drop_duplicates(inplace=True)
# print dataset1.shape

dataset1.user_merchant_buy_total = dataset1.user_merchant_buy_total.replace(np.nan,0)
dataset1.user_merchant_any = dataset1.user_merchant_any.replace(np.nan,0)
dataset1.user_merchant_received = dataset1.user_merchant_received.replace(np.nan,0)
dataset1[\'is_weekend\'] = dataset1.day_of_week.apply(lambda x:1 if x in (6,7) else 0)
weekday_dummies = pd.get_dummies(dataset1.day_of_week)
weekday_dummies.columns = [\'weekday\'+str(i+1) for i in range(weekday_dummies.shape[1])]
dataset1 = pd.concat([dataset1,weekday_dummies],axis=1)
dataset1[\'label\'] = dataset1.date.astype(\'str\') + \':\' +  dataset1.date_received.astype(\'str\')
dataset1.label = dataset1.label.apply(get_label)
dataset1.drop([\'merchant_id\',\'day_of_week\',\'date\',\'date_received\',\'coupon_id\',\'coupon_count\'],axis=1,inplace=True)
dataset1 = dataset1.replace(\'null\',np.nan)
dataset1.to_csv(\'dataset/dataset1.csv\',index=None)

  四、模型训练

import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler

dataset1 = pd.read_csv(\'dataset/dataset1.csv\')
dataset1.label.replace(-1,0,inplace=True)
dataset2 = pd.read_csv(\'dataset/dataset2.csv\')
dataset2.label.replace(-1,0,inplace=True)
dataset3 = pd.read_csv(\'dataset/dataset3.csv\')
# 去重
dataset1.drop_duplicates(inplace=True)
dataset2.drop_duplicates(inplace=True)
dataset3.drop_duplicates(inplace=True)

dataset12 = pd.concat([dataset1,dataset2],axis=0)
# 再次组合成训练集
dataset1_y = dataset1.label
dataset1_x = dataset1.drop([\'user_id\',\'label\',\'day_gap_before\',\'day_gap_after\'],axis=1)  # \'day_gap_before\',\'day_gap_after\' cause overfitting, 0.77
dataset2_y = dataset2.label
dataset2_x = dataset2.drop([\'user_id\',\'label\',\'day_gap_before\',\'day_gap_after\'],axis=1)
dataset12_y = dataset12.label
dataset12_x = dataset12.drop([\'user_id\',\'label\',\'day_gap_before\',\'day_gap_after\'],axis=1)
dataset3_preds = dataset3[[\'user_id\',\'coupon_id\',\'date_received\']]
dataset3_x = dataset3.drop([\'user_id\',\'coupon_id\',\'date_received\',\'day_gap_before\',\'day_gap_after\'],axis=1)
# dataset3_x = dataset3.drop([\'user_id\',\'coupon_id\',\'date_received\'],axis=1)

# print(dataset1_x.shape,dataset2_x.shape,dataset3_x.shape)

dataset1 = xgb.DMatrix(dataset1_x,label=dataset1_y)
dataset2 = xgb.DMatrix(dataset2_x,label=dataset2_y)
dataset12= xgb.DMatrix(dataset12_x,label=dataset12_y)
dataset3 = xgb.DMatrix(dataset3_x)
# 在XGBoost中,要将处理的数据存储在对象DMatrix中,方便下一步处理

  对特征筛选,训练,方便除去对标签影响因子小的特征属性,即剪枝

params={\'booster\':\'gbtree\',
	    \'objective\': \'rank:pairwise\',
	    \'eval_metric\':\'auc\',
	    \'gamma\':0.1,
	    \'min_child_weight\':1.1,
	    \'max_depth\':5,
	    \'lambda\':10,
	    \'subsample\':0.7,
	    \'colsample_bytree\':0.7,
	    \'colsample_bylevel\':0.7,
	    \'eta\': 0.01,
	    \'tree_method\':\'exact\',
	    \'seed\':0,
	    \'nthread\':12
	    }
watchlist = [(dataset12,\'train\')]
# 模型训练 model = xgb.train(params,dataset12,num_boost_round=3500,evals=watchlist) # 对dataset3进行预测 dataset3_preds[\'label\'] = model.predict(dataset3) dataset3_preds.label = MinMaxScaler().fit_transform(dataset3_preds.label.reshape(-1, 1)) dataset3_preds.sort_values(by=[\'coupon_id\',\'label\'],inplace=True) dataset3_preds.to_csv("xgb_preds.csv",index=None,header=None) # print(dataset3_preds.describe()) # feature_score来保存特征对标签的影响因子 feature_score = model.get_fscore() feature_score = sorted(feature_score.items(), key=lambda x:x[1],reverse=True) fs = [] for (key,value) in feature_score: fs.append("{0},{1}\n".format(key,value)) with open(\'xgb_feature_score.csv\',\'w\') as f: f.writelines("feature,score\n") f.writelines(fs)

  

  五、总结

  本次按着大佬的思路做了一次,感觉对自己的提升挺大的,学会了好多东西,总结下自己这段时间的工作吧,弄懂了一个完整的数据处理过程到底在干什么、怎么弄、以及锻炼了自己敲代码的能力,个人感觉真是在课本学习和实际操作过程中差距还是挺大的,实际操作下,学东西会更快,以后要多多参加这种竞赛,看看别人的想法,我这算是入门级的了。

  六、附录

  这些代码都在Jupyter Notebook上完美运行,所涉及的知识点也做了一些笔记整理,具体详见我的其他随笔。

  最后,感谢第一名大佬提供的源码。