Machine Learning in Action Reading Notes - Decision Trees

Date: 2022-12-20 11:33:31

Decision Trees

Below are the helper functions we will need.

# Compute the Shannon entropy of a given dataset

from math import log

def calcShannonEnt(dataSet):
    numEntries = len(dataSet)  # number of rows
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]  # class label in the last column of each row
        # Group the rows by class label and count each class
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    # Accumulate the Shannon entropy: H = -sum(p * log2(p)) over all classes
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
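
As a quick sanity check, here is the small fish dataset used in the book's chapter 3 (the dataset and expected value below are just an illustration):

# Toy dataset: two features plus a class label in the last column
myDat = [[1, 1, 'yes'],
         [1, 1, 'yes'],
         [1, 0, 'no'],
         [0, 1, 'no'],
         [0, 1, 'no']]

# 2 of 5 rows are 'yes' and 3 of 5 are 'no', so
# H = -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.971
print(calcShannonEnt(myDat))  # ~0.9709505944546686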
# Choose the best feature to split the dataset on
# (uses splitDataSet, defined below)

def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)  # entropy of the whole dataset
    bestInfoGain = 0.0
    bestFeature = -1

    # Loop over feature columns 0 .. numFeatures-1, split on each one, and keep
    # the best split (the last column holds the class label, so it is skipped)
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i

    return bestFeature
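
In formula form, the loop computes the information gain of splitting the dataset $D$ on each feature $A$ (the standard ID3 criterion, with $H$ the Shannon entropy from calcShannonEnt and $D_v$ the subset returned by splitDataSet):

$$\mathrm{Gain}(D, A) = H(D) - \sum_{v \in \mathrm{values}(A)} \frac{|D_v|}{|D|}\, H(D_v)$$

Here baseEntropy is $H(D)$, newEntropy accumulates the weighted sum, and the feature with the largest gain is returned.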
# Split the dataset on a given feature

def splitDataSet(dataSet, axis, value):
    # axis is the feature column index (0-based), value is the feature value
    # to match. E.g. axis=3, value="kk" keeps the rows whose column with
    # index 3 equals "kk" and returns them with that column removed.
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
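
For example, on the toy myDat dataset from the entropy check above:

# Rows whose column 0 equals the given value, with column 0 removed
print(splitDataSet(myDat, 0, 1))  # [[1, 'yes'], [1, 'yes'], [0, 'no']]
print(splitDataSet(myDat, 0, 0))  # [[1, 'no'], [1, 'no']]

# With splitDataSet now defined, the split chooser also works:
print(chooseBestFeatureToSplit(myDat))  # 0 - column 0 has the highest gain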
# Return the majority class in a list of class labels

import operator

def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    # Sort by count, descending, and return the most common label
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
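
For instance, when the features are exhausted but the classes still disagree, the majority vote decides:

print(majorityCnt(['yes', 'no', 'no']))  # 'no'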
# Function that builds the tree

def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]  # last column: class labels
    if classList.count(classList[0]) == len(classList):
        # All rows share one class: return that class directly
        return classList[0]
    if len(dataSet[0]) == 1:
        # Only the class column is left but classes still differ:
        # fall back to the majority class
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)  # index of the best feature column
    # labels holds the human-readable name of each feature column; it is not
    # part of dataSet and has one entry fewer, since the class column needs no name
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy, so recursion does not mutate our list
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)

    return myTree
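
Building a tree from the toy dataset, with the feature names from the book's fish example:

labels = ['no surfacing', 'flippers']
myTree = createTree(myDat, labels[:])  # pass a copy: createTree mutates labels
print(myTree)
# {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}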

Below is the final classification code. As with tree construction, it is recursive (tree algorithms frequently are).

# Classification function that uses the decision tree

def TreeClassify(inputTree, featLabels, testVec):
    firstStr = list(inputTree.keys())[0]  # feature name at the root of this subtree
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)  # map the feature name back to a column index
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if isinstance(secondDict[key], dict):
                # Still an internal node: recurse into the subtree
                classLabel = TreeClassify(secondDict[key], featLabels, testVec)
            else:
                # Leaf node: this is the class label
                classLabel = secondDict[key]
    return classLabel
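
Classifying a test vector (TreeClassify needs the intact feature-name list, which is why we passed createTree a copy above):

featLabels = ['no surfacing', 'flippers']
print(TreeClassify(myTree, featLabels, [1, 0]))  # 'no'
print(TreeClassify(myTree, featLabels, [1, 1]))  # 'yes'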

Building a decision tree is a time-consuming task even on small datasets, and on a large dataset it can take a great deal of computation. Classifying with an already-built tree, however, is very fast. To save time, it is therefore best to reuse a previously constructed tree each time we classify. To do this we use Python's pickle module to serialize objects. A serialized object can be saved to disk and read back when needed; any object can be serialized, dictionaries included.

# Store the decision tree on disk
def storeTree(inputTree, filename):
    import pickle
    # pickle writes bytes, so open the file in binary mode
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)

# Load a stored decision tree
def grabTree(filename):
    import pickle
    # read in binary mode to match storeTree
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
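
A round trip looks like this (the filename is just an example):

storeTree(myTree, 'classifierStorage.txt')
print(grabTree('classifierStorage.txt'))
# {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}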