1.需求说明

在生成样本集的决策树过程中，在得到树根节点后，后面还需要选择子树的根节点，这时候不能使用前面的样本集了，需要将已选作根节点的样本特征从样本集中裁减掉，为了便于理解，下面举例说明：

样本特征列表：

feature_type_list = ['youth','work','house','credit']

样本集：

samples_list = [ ['youth', 'work_no', 'house_no', '1', 'refuse']
                 ['youth', 'work_no', 'house_no', '2', 'refuse']
                 ['youth', 'work_yes', 'house_no', '2', 'agree']
                 ['youth', 'work_yes', 'house_yes', '1', 'agree']
                 ['youth', 'work_no', 'house_no', '1', 'refuse']
                 ['mid', 'work_no', 'house_no', '1', 'refuse']
                 ['mid', 'work_no', 'house_no', '2', 'refuse']
                 ['mid', 'work_yes', 'house_yes', '2', 'agree']
                 ['mid', 'work_no', 'house_yes', '3', 'agree']
                 ['mid', 'work_no', 'house_yes', '3', 'agree']
                 ['elder', 'work_no', 'house_yes', '3', 'agree']
                 ['elder', 'work_no', 'house_yes', '2', 'agree']
                 ['elder', 'work_yes', 'house_no', '2', 'agree']
                 ['elder', 'work_yes', 'house_no', '3', 'agree']
                 ['elder', 'work_no', 'house_no', '1', 'refuse'] ]

已经通过信息增益选出此样本集的决策树的最优根节点为特征house。

现在要选择子树的最优根节点，则需要更新样本集和样本特征列表。

需要将已选作根节点的样本特征house从样本集以及特征列表中裁减掉

通过如下规则得到新的样本集

step1:删除特征列表中的house
new_feature_type_list = ['youth','work','credit']

step2:删除house特征值为house_yes的所有样本行，并删除样本集中的house特征列
new_samples_list = [ ['youth', 'work_no', '1', 'refuse']
                     ['youth', 'work_no', '2', 'refuse']
                     ['youth', 'work_yes', '2', 'agree']
                     ['youth', 'work_no', '1', 'refuse']
                     ['mid', 'work_no', '1', 'refuse']
                     ['mid', 'work_no', '2', 'refuse']
                     ['elder', 'work_yes', '2', 'agree']
                     ['elder', 'work_yes', '3', 'agree']
                     ['elder', 'work_no', '1', 'refuse'] ]

本章的代码就是实现以上样本集的裁剪和特征列表的裁剪，供子决策树的根节点选择使用

2.代码

说明：裁剪工作主要由def tailer_work(self)方法来完成~~

# -*- coding: utf-8 -*-
"""
@author: 蔚蓝的天空Tom
Aim:在决策树中，在得到一个树根节点后，需要选择子树的根节点，这时候就需要对样本集进行裁剪
Aim:裁剪规则详见下面
"""

import numpy as np

'''Tool Function'''
def varnamestr(v, nms):
    '''
    Look up the name under which an object is stored in a mapping.

    :param v: the object to look for (matched by identity, not equality)
    :param nms: mapping of name -> object, e.g. the dict from locals()
    :return: the first key of nms whose value is the very object v
    :raises IndexError: when no key of nms refers to v
    '''
    matches = []
    for vn in nms:
        if nms[vn] is v:
            matches.append(vn)
    return matches[0]

#============================================
class CUtileTool(object):
    '''
    Debug-printing helpers.

    dump() dispatches on the type of its argument to dump_list,
    dump_array or dump_dict; anything else is printed on a single line.
    '''
    def _dump_rows(self, src, namestr):
        '''
        Print the part of the report shared by dump_list and dump_array:
        the type line, the np.shape line, the content and the footer.

        :param src: the list or array being printed
        :param namestr: label to show for src
        :return: None
        '''
        shape = np.shape(src)
        print('type(',namestr,'):',type(src))
        print('np.shape(',namestr,'):',shape)
        if 2 == len(shape):
            # Only a 2-D input is printed row by row; len() is taken here,
            # after the rank is known, so unsized inputs fall through to
            # the plain print below instead of raising.
            last = len(src) - 1
            for i in range(len(src)):
                if 0 == i and last == i:
                    # A single row is both first and last: open AND close
                    # the bracket (previously the closing ']' was lost).
                    print('[',src[i],']')
                elif 0 == i:
                    print('[',src[i])
                elif last == i:
                    print(src[i],']')
                else:
                    print(src[i])
        else:
            print(src)
        print('======\n')

    def dump_list(self, src_list, src_list_namestr):
        '''
        Print a list, one row per line when it is 2-D.

        :param src_list: the list to print
        :param src_list_namestr: label to show for the list
        :return: None
        '''
        print('\n============',src_list_namestr,'================')
        self._dump_rows(src_list, src_list_namestr)
        return

    def dump_array(self, src_a, src_dict_namestr):
        '''
        Print an array, one row per line when it is 2-D.

        :param src_a: the array to print
        :param src_dict_namestr: label to show for the array
        :return: None
        '''
        print('\n===============',src_dict_namestr,'===================')
        self._dump_rows(src_a, src_dict_namestr)
        return

    def print_dict(self, src_dict, level, src_dict_namestr=''):
        '''
        Recursively print a dict, one entry per line.

        A dict value is expanded on further lines only when it itself
        contains another dict; otherwise it is printed inline.
        Nothing is printed when src_dict is not a dict.

        :param src_dict: the dict to print
        :param level: recursion depth, 0 for the initial call
        :param src_dict_namestr: label printed before the opening brace at level 0
        '''
        if not isinstance(src_dict, dict):
            return
        # One tab at level 0, one more per nesting level.
        tab_str = '\t' * (level + 1)
        if 0 == level:
            print(src_dict_namestr,'= {')
        for key, value in src_dict.items():
            if isinstance(value, dict):
                has_dict = any(isinstance(v, dict) for v in value.values())
                if has_dict:
                    print(tab_str,key,":{")
                    self.print_dict(value, level + 1)
                else:
                    print(tab_str,key,':',value)
            else:
                print(tab_str,key,': ',value,)
        print(tab_str,'}')

    def dump_dict(self, src_dict, src_dict_namestr):
        '''
        Print a header with len/type/np.shape information, then the dict
        itself through print_dict.

        :param src_dict: the dict to print
        :param src_dict_namestr: label to show for the dict
        :return: None
        '''
        print('\n===============',src_dict_namestr,'===================')
        dict_len = len(src_dict)
        dict_shape = np.shape(src_dict)
        dict_type = type(src_dict)
        print('len(',src_dict_namestr,'):',dict_len)
        print('type(',src_dict_namestr,'):', dict_type)  #<class 'dict'>
        print('np.shape(',src_dict_namestr,'):', dict_shape)
        print('len(dict_shape):', len(dict_shape))

        self.print_dict(src_dict, 0, src_dict_namestr)
        print('======\n')
        return

    def dump(self, src_thing, src_thing_namestr):
        '''
        Print a value using the dumper that matches its type.

        :param src_thing: the value to print (list, np.ndarray, dict or anything else)
        :param src_thing_namestr: label to show for the value
        :return: None
        '''
        if isinstance(src_thing, list):
            return self.dump_list(src_thing, src_thing_namestr)
        elif isinstance(src_thing, np.ndarray):
            return self.dump_array(src_thing, src_thing_namestr)
        elif isinstance(src_thing, dict):
            return self.dump_dict(src_thing, src_thing_namestr)
        else:
            print(src_thing_namestr,':', src_thing)
        return
#===========================================


'''
裁剪规格简介

#每个样本example的特征列表
feature_type_list = ['youth','work','house','credit']

即每个样本=[age_value, work_value, house_value, credit_value, class_label]

如下一个样本集：
samples_list = [ ['youth', 'work_no', 'house_no', '1', 'refuse']
                 ['youth', 'work_no', 'house_no', '2', 'refuse']
                 ['youth', 'work_yes', 'house_no', '2', 'agree']
                 ['youth', 'work_yes', 'house_yes', '1', 'agree']
                 ['youth', 'work_no', 'house_no', '1', 'refuse']
                 ['mid', 'work_no', 'house_no', '1', 'refuse']
                 ['mid', 'work_no', 'house_no', '2', 'refuse']
                 ['mid', 'work_yes', 'house_yes', '2', 'agree']
                 ['mid', 'work_no', 'house_yes', '3', 'agree']
                 ['mid', 'work_no', 'house_yes', '3', 'agree']
                 ['elder', 'work_no', 'house_yes', '3', 'agree']
                 ['elder', 'work_no', 'house_yes', '2', 'agree']
                 ['elder', 'work_yes', 'house_no', '2', 'agree']
                 ['elder', 'work_yes', 'house_no', '3', 'agree']
                 ['elder', 'work_no', 'house_no', '1', 'refuse'] ]


假设已经通过信息增益选出此样本集的决策树的最优根节点为特征house

如果想求子决策树的最优根节点的话，就需要对原始样本集进行裁剪了，然后用新的样本集筛选新的最优根节点

#通过如下规则得到新的样本集
step1:删除house特征值为house_yes所在的所有行
step2:然后再删除house特征值列
'''
class CTailorSamples(object):
    '''
    Trim a sample set and its feature list.

    Given the column index of a feature and one value of that feature,
    the trimmed result has:
      * the feature removed from the feature list,
      * every sample whose value in that column equals the given value removed,
      * that column removed from every remaining sample.

    The trimming runs once in the constructor on private copies, so the
    lists passed in by the caller are left untouched and can be reused
    (for example to trim again on another feature value).
    '''
    def __init__(self, data_list, feat_type_list, feat_type_index, feat_value):
        '''
        :param data_list: samples, one sequence of feature values (plus label) per sample
        :param feat_type_list: feature names, one per feature column
        :param feat_type_index: index of the feature/column to cut away
        :param feat_value: samples holding this value in that column are dropped
        :raises IndexError: if feat_type_index is out of range for the
            feature list or for any sample
        '''
        # Copy both levels: rows are edited in place by tailer_work, and the
        # caller's rows must not lose a column as a side effect.
        self.data_list              = [list(example) for example in data_list]
        self.feat_type_list         = list(feat_type_list)
        self.feat_type_index_tailed = feat_type_index
        self.feat_value_tailed      = feat_value

        # Trim immediately; results are read back through get_samples().
        self.tailer_work()

    def get_samples(self):
        '''
        Return the trimmed data.

        :return: tuple (trimmed sample list, trimmed feature list)
        '''
        return self.data_list, self.feat_type_list

    def get_all_indexs(self, src_list, dst_value):
        '''
        Return the indexes of all elements equal to dst_value.

        src_list = [10,20,30,30,30,50]
        e = 30
        indexs_list = tailor.get_all_indexs(src_list, e)
        print(indexs_list) #[2, 3, 4]

        :param src_list: sequence to search
        :param dst_value: value to look for (compared with ==)
        :return: list of matching indexes in ascending order
        '''
        return [i for i, x in enumerate(src_list) if x == dst_value]

    def tailer_work(self):
        '''
        Apply the trimming to self.feat_type_list and self.data_list.

        Also stores the values of the removed column, as they were before
        trimming, in self.feat_value_list.

        :return: tuple (trimmed sample list, trimmed feature list)
        :raises IndexError: if the column index is out of range
        '''
        colum_to_del = self.feat_type_index_tailed

        # Snapshot the column first: a too-short sample raises here,
        # before anything has been modified.
        self.feat_value_list = [example[colum_to_del] for example in self.data_list]

        # Trim the feature list.
        del self.feat_type_list[colum_to_del]

        # Keep only the samples that do NOT carry the value being cut away.
        kept = [example
                for example, value in zip(self.data_list, self.feat_value_list)
                if not value == self.feat_value_tailed]

        # Drop the feature column from the surviving samples.
        for example in kept:
            del example[colum_to_del]

        # Slice-assign so self.data_list stays the same list object.
        self.data_list[:] = kept

        return self.data_list, self.feat_type_list
        

#测试&使用步骤展示
def CTailorSamples_manual():
    '''
    Usage walkthrough: trim the 15-sample loan data set on the 'house'
    feature with value 'house_yes' and print the trimmed sample set and
    feature list.
    '''
    # Sample set: [age, working, house, credit, class label] per row.
    samples = [
        ['youth', 'work_no', 'house_no', '1', 'refuse'],
        ['youth', 'work_no', 'house_no', '2', 'refuse'],
        ['youth', 'work_yes', 'house_no', '2', 'agree'],
        ['youth', 'work_yes', 'house_yes', '1', 'agree'],
        ['youth', 'work_no', 'house_no', '1', 'refuse'],
        ['mid', 'work_no', 'house_no', '1', 'refuse'],
        ['mid', 'work_no', 'house_no', '2', 'refuse'],
        ['mid', 'work_yes', 'house_yes', '2', 'agree'],
        ['mid', 'work_no', 'house_yes', '3', 'agree'],
        ['mid', 'work_no', 'house_yes', '3', 'agree'],
        ['elder', 'work_no', 'house_yes', '3', 'agree'],
        ['elder', 'work_no', 'house_yes', '2', 'agree'],
        ['elder', 'work_yes', 'house_no', '2', 'agree'],
        ['elder', 'work_yes', 'house_no', '3', 'agree'],
        ['elder', 'work_no', 'house_no', '1', 'refuse'],
    ]
    # Feature names, one per feature column of the sample set.
    feature_names = ['age', 'working', 'house', 'credit']

    # What to cut away:
    #   1) every sample row whose house value is 'house_yes'
    #   2) the house column of the sample set
    #   3) the 'house' entry of the feature list
    value_to_drop = 'house_yes'
    house_column = feature_names.index('house')

    # The trimming runs inside the constructor; get_samples() returns
    # the trimmed sample set and feature list.
    trimmer = CTailorSamples(samples, feature_names, house_column, value_to_drop)
    trimmed = trimmer.get_samples()

    # Print both results so the trimming can be checked by eye.
    printer = CUtileTool()
    for payload, label in zip(trimmed, ('new_data_list', 'new_feat_type_list')):
        printer.dump(payload, label)

# Run the walkthrough only when this file is executed as a script.
if __name__ == "__main__":
    CTailorSamples_manual()

3.执行结果

============ new_data_list ================
type( new_data_list ): <class 'list'>
np.shape( new_data_list ): (9, 4)
[ ['youth', 'work_no', '1', 'refuse']
['youth', 'work_no', '2', 'refuse']
['youth', 'work_yes', '2', 'agree']
['youth', 'work_no', '1', 'refuse']
['mid', 'work_no', '1', 'refuse']
['mid', 'work_no', '2', 'refuse']
['elder', 'work_yes', '2', 'agree']
['elder', 'work_yes', '3', 'agree']
['elder', 'work_no', '1', 'refuse'] ]
======

============ new_feat_type_list ================
type( new_feat_type_list ): <class 'list'>
np.shape( new_feat_type_list ): (3,)
['age', 'working', 'credit']
======

enjoy it~

(end)

秒客网

【机器学习】【决策树】自己动手用python实现样本集的裁剪，新样本集以供计算子决策树的最优根节点

1.需求说明

3.执行结果

相关文章