基于用户相似性的协同过滤——Python实现

时间:2022-08-23 04:19:24

代码基本来自项亮的《推荐系统实践》,将书中的伪代码具体实现,并参考了 https://www.douban.com/note/336280497/

还可以加入对用户相似性的归一化操作,效果会更好。

数据集为MovieLens的10万条数据.
链接:MovieLens

#coding:utf-8
import random,math
from operator import itemgetter class UserBasedCF:
def __init__(self,trainDataFile=None,testDataFile=None,splitor='\t'):
if trainDataFile!=None:
self.train=self.loadData(trainDataFile, splitor)
if testDataFile!=None:
self.test=self.loadData(testDataFile, splitor)
self.simiMatrix={} def setData(self,train,test):
self.train=train
self.test=test def loadData(self,dataFile,splitor='\t'):
data={}
for line in open(dataFile):
user,item,record,_ = line.split()
data.setdefault(user,{})
data[user][item]=record
return data def recallAndPrecision(self,peersCount,topN=10):
hit=0
recall=0
precision=0
for user in self.train.keys():
itemOfuser=self.test.get(user,{})
recItems=self.recommend(user,peersCount,topN)
for item,pui in recItems.items():
if item in itemOfuser:
hit+=1
recall+=len(itemOfuser)
precision+=topN
#print 'Recall:%s hit:%s allRatings:%s'%(hit/(recall*1.0),hit,precision)
return (hit / (recall * 1.0),hit / (precision * 1.0)) def coverage(self,peersCount,topN=10):
recommend_items=set()
all_items=set()
for user in self.train.keys():
for item in self.train[user].keys():
all_items.add(item)
rank=self.recommend(user,peersCount,topN)
for item,pui in rank.items():
recommend_items.add(item)
return len(recommend_items)/(len(all_items)*1.0) def popularity(self,peersCount,topN=10):
item_popularity=dict()
for user,items in self.train.items():
for item in items.keys():
if item not in item_popularity:
item_popularity[item]=1
item_popularity[item]+=1
ret=0
n=0
for user in self.train.keys():
rank=self.recommend(user,peersCount,topN)
for item,pui in rank.items():
ret+=math.log(1+item_popularity[item])
n+=1
return ret/(n*1.0) def calUserSimilarity(self):
item_users=dict()
for u,ratings in self.train.items():
for i in ratings.keys():
item_users.setdefault(i,set())
item_users[i].add(u) #calculate co-rated items between users
coRatedCount=dict()
itemCountOfUser=dict()
for item,users in item_users.items():
for u in users:
itemCountOfUser.setdefault(u,0)
itemCountOfUser[u]+=1
for v in users:
if u==v:
continue
coRatedCount.setdefault(u,{})
coRatedCount[u].setdefault(v,0)
coRatedCount[u][v]+=1/math.log(1+len(users))
userSimiMatrix=dict()
for u,related_users in coRatedCount.items():
userSimiMatrix.setdefault(u,{})
for v,cuv in related_users.items():
userSimiMatrix[u][v]=cuv/math.sqrt(itemCountOfUser[u]*itemCountOfUser[v])
self.simiMatrix=userSimiMatrix def recommend(self,userU,peersCount,topN=10):
recItems=dict()
interacted_items=self.train[userU]
'''prepare the user similarity matrix first'''
if not self.simiMatrix:
self.calUserSimilarity()
for userV,simiUV in sorted(self.simiMatrix[userU].items(),key=itemgetter(1),reverse=True)[0:peersCount]:
for item,ratingV4I in self.train[userV].items():
if item in interacted_items:
continue
if item not in recItems:
recItems[item]=0
recItems[item]+=simiUV*float(ratingV4I)#transform 4 stars into score 0.8 '''if len(recItems)==topN:
return recItems'''
return dict(sorted(recItems.items(),key = lambda x :x[1],reverse = True)[0:topN]) def testUserBasedCF():
cf=UserBasedCF(trainDataFile=r'E:\ResearchAndPapers\DataSet\ml-100k\u3.base',testDataFile=r'E:\ResearchAndPapers\DataSet\ml-100k\u3.test')
#cf.calUserSimilarity()
print("%3s%15s%15s%15s%15s" % ('K',"precision",'recall','coverage','popularity'))
for k in [5,10,20,40,80,160]:
recall,precision = cf.recallAndPrecision(peersCount = k)
coverage = cf.coverage(peersCount = k)
popularity = cf.popularity(peersCount = k)
print("%3d%14.2f%%%14.2f%%%14.2f%%%15.2f" % (k,precision * 100,recall * 100,coverage * 100,popularity)) def SplitData(wholeData,M,k,seed,splitor='\t'):
test={}
train={}
random.seed(seed) for line in wholeData:
user,item,score,time=line.strip().split(splitor)
if random.randint(0,M)==k:
test.setdefault(user,{})
test[user][item]=score
else:
train.setdefault(user,{})
train[user][item]=score
return train,test def testUserBasedCF2():
wholeData=open(r'E:\ResearchAndPapers\DataSet\ml-1m\ratings.dat')
train,test=SplitData(wholeData, 8, 5, 10, splitor='::')
cf=UserBasedCF()
cf.setData(train, test)
#cf=UserBasedCF(trainDataFile=r'E:\ResearchAndPapers\DataSet\ml-100k\u5.base',testDataFile=r'E:\ResearchAndPapers\DataSet\ml-100k\u5.test')
#cf.calUserSimilarity()
print("%3s%15s%15s%15s%15s" % ('K',"precision",'recall','coverage','popularity'))
for k in [5,10,20,40,80,160]:
recall,precision = cf.recallAndPrecision(peersCount = k)
coverage = cf.coverage(peersCount = k)
popularity = cf.popularity(peersCount = k)
print("%3d%14.2f%%%14.2f%%%14.2f%%%15.2f" % (k,precision * 100,recall * 100,coverage * 100,popularity)) if __name__=="__main__":
testUserBasedCF()
#testUserBasedCF2()