Machine Learning: Association Analysis with the Apriori Algorithm

Date: 2021-12-02 16:50:02

    This post records the code I wrote while studying machine learning, following the examples in Machine Learning in Action (《机器学习实战》).

from numpy import *
def loadDataSet():
    # Toy transaction dataset from the book: four "shopping baskets"
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]

def createC1(dataSet):
    # Build C1, the list of all candidate 1-itemsets, as frozensets
    C1 = []
    for transaction in dataSet:
        for item in transaction:
            if not [item] in C1:
                C1.append([item])
    C1.sort()
    # frozenset is a "frozen", immutable set: the user cannot modify it, which lets it be used as a dictionary key later
    return list(map(frozenset, C1))
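
frozenset matters here because scanD below counts candidates in a dictionary, and dictionary keys must be hashable. A minimal illustration, not from the book:

counts = {}
counts[frozenset([1, 3])] = 2    # fine: frozenset is hashable
# counts[{1, 3}] = 2             # would raise TypeError: unhashable type: 'set'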

# scanD takes the dataset D, a list Ck of candidate itemsets, and minSupport, the minimum support for itemsets of interest
def scanD(D, Ck, minSupport):
    ssCnt = {}
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                if can not in ssCnt: ssCnt[can] = 1
                else: ssCnt[can] += 1
    numItems = float(len(D))
    retList = []
    supportData = {}
    for key in ssCnt:
        support = ssCnt[key] / numItems
        if support >= minSupport:
            retList.insert(0, key)
        supportData[key] = support    # record the support of every counted candidate, frequent or not
    return retList, supportData

#~ dataSet=loadDataSet()
#~ C1=createC1(dataSet)
#~ print(C1)
#~ D=list(map(set,dataSet))
#~ L1,suppData0=scanD(D,C1,0.5)
#~ print(L1)
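
As a sanity check on scanD, the support of an itemset is just the fraction of transactions that contain it; a quick hand computation on the toy data above:

# {5} appears in 3 of the 4 toy transactions, so its support is 0.75 and it survives minSupport=0.5
dataSet = loadDataSet()
D = list(map(set, dataSet))
count = sum(1 for tid in D if frozenset([5]).issubset(tid))
print(count / float(len(D)))    # 0.75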

# aprioriGen() takes the list of frequent itemsets Lk and the itemset size k, and returns the candidate list Ck
def aprioriGen(Lk, k):
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i + 1, lenLk):
            # Compare only the first k-2 items: two frequent (k-1)-itemsets are merged
            # only when this prefix matches, so each k-itemset is generated exactly once
            L1 = list(Lk[i])[:k - 2]
            L2 = list(Lk[j])[:k - 2]
            L1.sort(); L2.sort()
            if L1 == L2:
                retList.append(Lk[i] | Lk[j])    # union of the two sets
    return retList
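
For example, merging frequent 2-itemsets the way the book describes: {0,1} and {0,2} share the one-item prefix {0}, so they combine into {0,1,2}, and no duplicate is produced from the other pairs. A small check (these 2-itemsets are illustrative, not derived from the toy data):

L2 = [frozenset([0, 1]), frozenset([0, 2]), frozenset([1, 2])]
print(aprioriGen(L2, 3))    # [frozenset({0, 1, 2})]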

def apriori(dataSet, minSupport = 0.5):
    C1 = createC1(dataSet)
    D = list(map(set, dataSet))
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    while (len(L[k-2]) > 0):    # stop once no frequent (k-1)-itemsets remain
        Ck = aprioriGen(L[k-2], k)
        Lk, supK = scanD(D, Ck, minSupport)    # scan DB to get Lk
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData

#~ dataSet=loadDataSet()
#~ L,suppData=apriori(dataSet)
#~ print(L)
#~ print(aprioriGen(L[0],2))
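
Raising minSupport shrinks the result; a quick comparison on the toy data, assuming the functions above:

# With minSupport=0.5 the toy data yields frequent 1-, 2- and 3-itemsets;
# with minSupport=0.7 fewer itemsets survive and the largest frequent itemsets are smaller.
dataSet = loadDataSet()
L05, _ = apriori(dataSet, minSupport=0.5)
L07, _ = apriori(dataSet, minSupport=0.7)
print([len(level) for level in L05])
print([len(level) for level in L07])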

# generateRules() takes three arguments: the list of frequent itemsets, a dict of support data for those itemsets, and a minimum confidence threshold
def generateRules(L, supportData, minConf=0.7):
    bigRuleList = []
    for i in range(1, len(L)):    # start at L[1]: only itemsets with two or more items can yield rules
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]
            if (i > 1):
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList

def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    prunedH = []    # create new list to return
    for conseq in H:
        conf = supportData[freqSet] / supportData[freqSet - conseq]    # calc confidence
        if conf >= minConf:
            print(freqSet - conseq, '-->', conseq, 'conf:', conf)
            brl.append((freqSet - conseq, conseq, conf))
            prunedH.append(conseq)
    return prunedH
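
The confidence of a rule X --> Y is support(X ∪ Y) / support(X). Checking one rule from the toy data by hand, using the support dict returned by apriori above:

# support({2,5}) = 0.75 and support({5}) = 0.75, so conf({5} --> {2}) = 1.0
dataSet = loadDataSet()
L, suppData = apriori(dataSet, minSupport=0.5)
freqSet, conseq = frozenset([2, 5]), frozenset([2])
print(suppData[freqSet] / suppData[freqSet - conseq])    # 1.0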

def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    m = len(H[0])    # size of the consequents currently in H (all have the same size)
    print('m=', m)
    # The right-hand side of a rule may contain two or more items; starting from the set {0,1,2,3}, H1 is [{0},{1},{2},{3}]
    # If the frequent itemset has more than two elements, we try to merge the consequents further,
    # so the right-hand side can grow to e.g. {0,1}, {0,2}, {0,1,2}
    if (len(freqSet) > (m + 1)):    # try further merging
        Hmp1 = aprioriGen(H, m + 1)    # create Hm+1 new candidates
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        if (len(Hmp1) > 1):    # need at least two sets to merge
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
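
To see the consequents grow, the sketch below calls rulesFromConseq directly on the frequent set {2,3,5} from the toy data: the single-item consequents in H1 are merged by aprioriGen into pairs such as {2,3}, and those two-item consequents are then checked against minConf (supports come from the apriori run):

dataSet = loadDataSet()
L, suppData = apriori(dataSet, minSupport=0.5)
rules = []
H1 = [frozenset([item]) for item in frozenset([2, 3, 5])]
rulesFromConseq(frozenset([2, 3, 5]), H1, suppData, rules, minConf=0.5)
print(rules)    # rules such as ({5} --> {2, 3}) with confidence around 0.667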

#~ dataSet=loadDataSet()
#~ L,suppData=apriori(dataSet,minSupport=0.5)
#~ rules=generateRules(L,suppData,minConf=0.5)
#~ print(rules)

# Real-data example: the mushroom dataset (mushroom.dat) that ships with the book's source code,
# one mushroom per line, each feature encoded as a number
mushDatSet = [line.split() for line in open('mushroom.dat').readlines()]
L, suppData = apriori(mushDatSet, minSupport=0.3)
# Print the frequent 3-itemsets that contain feature value '2' (poisonous)
for item in L[2]:
    if item.intersection('2'): print(item)
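
The book repeats the same check on the larger frequent itemsets; assuming the run above also produced frequent 4-itemsets, for example:

# Frequent 4-itemsets containing the poisonous feature '2'
for item in L[3]:
    if item.intersection('2'): print(item)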