import pandas as pd
import numpy as np
import json
# Load the watermelon 3.0 data set (the file is GBK-encoded). read_csv already
# returns a DataFrame, so no extra DataFrame() wrapper is needed.
df = pd.read_csv("西瓜数据集3.0.csv", encoding="gbk")
# Remove the "编号" column so it is not part of the feature list below.
df.drop(labels=["编号"], axis=1, inplace=True)
# Relabel the class column. The result is assigned back instead of calling
# replace(inplace=True) on the df["好瓜"] selection: that chained in-place call
# emits a FutureWarning on pandas 2.2 and no longer updates df once
# Copy-on-Write is active.
df["好瓜"] = df["好瓜"].replace(to_replace=["是", "否"], value=["好瓜", "坏瓜"])
# Every column except the last one is a feature (presumably the last column is
# the "好瓜" class label -- confirm against the CSV layout).
featureList = df.columns[:-1]
# Observed values of each discrete feature. The last two features are treated
# as continuous and are handled separately below.
featureValue = {}
for feature in featureList[:-2]:
    featureValue[feature] = set(df[feature])
T = {}  # candidate split points, keyed by continuous feature name
for feature in featureList[-2:]:  # continuous attributes
    sortedValues = df[feature].sort_values()
    # Pair every sorted value with its successor; .iloc makes the positional
    # slicing explicit, and the index reset aligns the two halves for addition.
    lowerValues = sortedValues.iloc[:-1].reset_index(drop=True)
    upperValues = sortedValues.iloc[1:].reset_index(drop=True)
    # Candidate thresholds are the midpoints between adjacent sorted values.
    T[feature] = (lowerValues + upperValues) / 2
def Ent(D):
    """Return the base-2 information entropy of the "好瓜" column of D.

    D must support ``D["好瓜"]`` returning a pandas Series of class labels.
    """
    labels = D["好瓜"]
    # Relative frequency of every class that occurs in D.
    proportions = labels.value_counts() / len(labels)
    weightedLogs = [pk * np.log2(pk) for pk in proportions]
    return -sum(weightedLogs)
def split_discrete(D, feature):
    """Partition D by the value(s) of one or more discrete columns.

    Args:
        D: DataFrame to partition.
        feature: A column name, or a list of column names, to group by.

    Returns:
        A list of ``(key, subset)`` tuples as produced by iterating
        ``D.groupby(by=feature)``, one per group.
    """
    # groupby already groups rows by default. The explicit ``axis=0`` keyword
    # was deprecated in pandas 2.1 and is not accepted by newer releases, so it
    # is omitted here.
    return list(D.groupby(by=feature))
def split_continues(D, feature, splitValue):
    """Split D in two around a threshold on a continuous column.

    Returns a two-element list: first the rows whose ``feature`` value is
    ``<= splitValue``, then the rows whose value is ``> splitValue``.
    """
    column = D[feature]
    notAbove = D[column <= splitValue]
    above = D[column > splitValue]
    return [notAbove, above]
def Gain_discrete(D, feature):
    """Return the information gain of partitioning D on a discrete feature.

    The gain is the entropy of D minus the size-weighted entropies of the
    subsets returned by ``split_discrete``.
    """
    total = len(D)
    weightedEntropy = sum(
        len(subset) / total * Ent(subset)
        for _, subset in split_discrete(D, feature)
    )
    return Ent(D) - weightedEntropy
def Gain_continues(D, feature):
    """Find the best binary split of D on a continuous feature.

    Every candidate threshold stored in the module-level ``T[feature]`` is
    tried, and the one with the largest information gain is kept.

    Args:
        D: DataFrame holding the samples to split.
        feature: Name of a continuous column that is also a key of ``T``.

    Returns:
        A ``(gain, threshold)`` tuple. It is ``(0, 0)`` when no candidate
        gives a gain strictly greater than zero.
    """
    # Both values are the same for every candidate, so compute them once
    # instead of once per threshold.
    baseEntropy = Ent(D)
    total = len(D)
    _max = 0
    splitValue = 0
    for t in T[feature].values:  # try every candidate split point
        weightedEntropy = sum(
            len(Dv) / total * Ent(Dv) for Dv in split_continues(D, feature, t)
        )
        temp = baseEntropy - weightedEntropy
        # Strict comparison: on ties the earliest candidate wins.
        if _max < temp:
            _max = temp
            splitValue = t
    return _max, splitValue
def chooseBestFeature(D, A):
    """Return the name of the feature in A with the largest information gain.

    For the continuous features "密度" and "含糖率" the returned name has the
    form ``"<feature><=<threshold>"``, with the threshold formatted to three
    decimals. Discrete features are returned by their plain name. When several
    features tie, the one that comes first in A is returned.
    """
    scores = {}
    for name in A:
        if name not in ("密度", "含糖率"):
            scores[name] = Gain_discrete(D, name)
            continue
        # Continuous attribute: keep the chosen threshold inside the key.
        gain, threshold = Gain_continues(D, name)
        scores[name + "<=%.3f" % threshold] = gain
    # A stable descending sort keeps the first-inserted entry ahead on ties.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    bestName, _ = ranked[0]
    return bestName
def countMajority(D):
    """Return the most frequent class label in the "好瓜" column of D.

    ``Series.mode`` can report several values; the first one is returned.
    """
    mostCommon = D["好瓜"].mode()
    return mostCommon.iloc[0]
def treeGenerate(D, A):
    """Recursively build a decision tree for the samples in D.

    Args:
        D: DataFrame with the samples that reach this node. It must contain
            the "好瓜" class column and every column named in A.
        A: pandas Index of feature names still available for splitting.

    Returns:
        A class label for a leaf. Otherwise a single-key dict: a discrete
        split is ``{feature: {value: subtree, ...}}`` and a continuous split
        is ``{"feature<=threshold": {"yes": subtree, "no": subtree}}``.
    """
    # Every sample has the same class: pure leaf.
    if len(split_discrete(D, "好瓜")) == 1:
        return D["好瓜"].iloc[0]
    # No feature left, or all samples agree on every remaining feature:
    # nothing can separate them, so fall back to the majority class.
    if len(A) == 0 or len(split_discrete(D, A.tolist())) == 1:
        return countMajority(D)
    bestFeature = chooseBestFeature(D, A)
    if "<=" in bestFeature:  # continuous attribute
        bestFeature, splitValue = bestFeature.split("<=")
        D0, D1 = split_continues(D, bestFeature, float(splitValue))
        # A threshold that leaves one side empty separates nothing. This
        # happens when no candidate had positive gain, or when the 3-decimal
        # rounding of the threshold moved it past a sample. Recursing would
        # revisit the same D forever, so stop with a majority leaf.
        if len(D0) == 0 or len(D1) == 0:
            return countMajority(D)
        nodeName = bestFeature + "<=" + splitValue
        # A continuous feature stays in A: it may be split again further down
        # with a different threshold.
        return {
            nodeName: {
                "yes": treeGenerate(D0, A),
                "no": treeGenerate(D1, A),
            }
        }
    branches = {}
    # A discrete feature is used up once it has been split on.
    remaining = pd.Index(A).drop([bestFeature])
    for bestFeatureValue, Dv in split_discrete(D, bestFeature):
        if len(Dv) == 0:
            # Empty subset: label only this branch with the parent's majority
            # class, and keep the sibling branches already built.
            branches[bestFeatureValue] = countMajority(D)
        else:
            Dv = Dv.drop(labels=[bestFeature], axis=1)
            branches[bestFeatureValue] = treeGenerate(Dv, remaining)
    # Values recorded in the module-level featureValue that do not occur in D
    # would otherwise get no branch at all. Give each one a leaf labelled with
    # the majority class of D. Sorting by str keeps the key order stable.
    knownValues = set(featureValue.get(bestFeature, ()))
    for missingValue in sorted(knownValues - set(branches), key=str):
        branches[missingValue] = countMajority(D)
    return {bestFeature: branches}
if __name__ == "__main__":
    # Build the tree from the full data set, then print it as indented JSON.
    # ensure_ascii=False keeps the non-ASCII labels readable in the output.
    tree = treeGenerate(df, featureList)
    myTree = json.dumps(
        tree,
        ensure_ascii=False,
        indent=2,
        separators=(',', ':'),
    )
    print(myTree)