Apriori算法在购物篮分析中的运用

　　购物篮分析是一个很经典的数据挖掘案例,其中运用到了Apriori算法。下面以从网上下载的某超市某月份的交易数据为例,利用Apriori算法进行关联分析。示例使用Python + MongoDB实现。

　　处理过程分为两步:1. 数据建模(将Excel中的数据写入到MongoDB数据库);2. 从数据库中读取数据进行分析。

　　Excel文件下载地址:http://download.csdn.net/detail/artscrafts/6805689

　　案例配置文件 setting.py

 # --- MongoDB connection -------------------------------------------------
 host = 'localhost'
 port = 27017
 db_name = 'shopping_basket'

 # --- Collection names ---------------------------------------------------
 # items_name holds the item (goods) names, record_name the transactions.
 items_name = 'goods_items'
 record_name = 'transaction_record'

 # --- Input --------------------------------------------------------------
 # Excel workbook that load_basket.py reads.
 data_source = 'supermarket.xls'

　　读取Excel数据到MongoDB中 load_basket.py

 from xlrd import open_workbook

 from pymongo import MongoClient

 import setting

 # Workbook named by setting.data_source, opened once at import time.
 wb = open_workbook(setting.data_source, encoding_override='utf-8')

 # MongoDB client and the target database, both configured in setting.py.
 client = MongoClient(setting.host, setting.port)

 db = client[setting.db_name]

 # Item names in spreadsheet column order; filled by load_items() and
 # indexed by position in load_record().
 items = []

 #read xls

 def read_one_line(workbook, sheet_index=0, row_index=0, start_col_index=0):
     """Lazily yield the cell values of one row of a worksheet.

     Args:
         workbook: Object whose ``sheets()`` returns a sequence of sheets
             exposing ``nrows``, ``ncols`` and ``cell(row, col).value``.
         sheet_index: Index of the sheet to read.
         row_index: Zero-based index of the row to read.
         start_col_index: Zero-based column to start from. A value outside
             ``0..ncols`` makes the generator yield nothing.

     Yields:
         The ``value`` of every cell from ``start_col_index`` to the last
         column of the row.

     Raises:
         IndexError: If ``row_index`` is outside the sheet. Because this is
             a generator, the error surfaces on the first iteration.
     """
     # Honour sheet_index instead of always reading the first sheet.
     sheet = workbook.sheets()[sheet_index]
     max_row = sheet.nrows
     max_col = sheet.ncols

     # Column 0 is a valid start (and the default); only genuinely
     # out-of-range values collapse to an empty column range.
     if not 0 <= start_col_index <= max_col:
         start_col_index = max_col

     if row_index < 0 or row_index >= max_row:
         raise IndexError(
             'row_index %d out of range (sheet has %d rows)' % (row_index, max_row))

     # range() behaves the same on Python 2 and 3 for this use.
     for col_index in range(start_col_index, max_col):
         yield sheet.cell(row_index, col_index).value

 #read xls

 def readlines(workbook, sheet_index=0, start_row_index=0, end_row_index=None, start_col_index=0, end_col_index=None):
     """Lazily yield a rectangular region of a worksheet, one row at a time.

     Args:
         workbook: Object whose ``sheets()`` returns a sequence of sheets
             exposing ``nrows``, ``ncols`` and ``cell(row, col).value``.
         sheet_index: Index of the sheet to read.
         start_row_index: First row to read (inclusive).
         end_row_index: Row to stop before; ``None`` means the last row.
         start_col_index: First column to read (inclusive).
         end_col_index: Column to stop before; ``None`` means the last column.

     Yields:
         A list with the cell values of each row in the requested region.
     """
     sheet = workbook.sheets()[sheet_index]

     # Compare against None explicitly so that an end index of 0 means an
     # empty region rather than "read everything".
     if end_row_index is None:
         end_row_index = sheet.nrows
     if end_col_index is None:
         end_col_index = sheet.ncols

     # range() behaves the same on Python 2 and 3 for this use.
     for row_index in range(start_row_index, end_row_index):
         yield [sheet.cell(row_index, col_index).value
                for col_index in range(start_col_index, end_col_index)]

 #from xls to mongodb

 def load_items():
     """Copy the item names from the spreadsheet into MongoDB.

     Reads row 1 of ``wb`` from column 1 onward, appends every name to the
     module-level ``items`` list, and stores one ``{'id': n, 'name': name}``
     document per item in the ``setting.items_name`` collection. Ids start
     at 1 and documents are written in batches of up to 100.
     """
     collection = db[setting.items_name]
     batch_size = 100
     batch = []

     names = read_one_line(wb, row_index=1, start_col_index=1)
     for item_id, name in enumerate(names, 1):
         batch.append({'id': item_id, 'name': name})
         items.append(name)
         if len(batch) >= batch_size:
             # NOTE(review): insert() is kept from the original code; newer
             # PyMongo releases expose insert_many() for this - confirm
             # against the installed driver version.
             collection.insert(batch)
             batch = []

     # Flush the trailing partial batch; without this the last (<100)
     # items were never written.
     if batch:
         collection.insert(batch)

 # from xls to mongodb

 def load_record():
     """Copy the transaction rows from the spreadsheet into MongoDB.

     Reads ``wb`` from row 2 and column 1 onward. For each row, the cells
     equal to ``'T'`` select the item names at the same position in the
     module-level ``items`` list (so ``load_items()`` must have run first).
     One ``{'id': n, 'items': [...]}`` document per row is stored in the
     ``setting.record_name`` collection. Ids start at 1 and documents are
     written in batches of up to 100.
     """
     collection = db[setting.record_name]
     batch_size = 100
     batch = []

     rows = readlines(wb, start_row_index=2, start_col_index=1)
     for record_id, line in enumerate(rows, 1):
         bought = [items[i] for i, flag in enumerate(line) if flag == 'T']
         batch.append({'id': record_id, 'items': bought})
         if len(batch) >= batch_size:
             # NOTE(review): insert() is kept from the original code; newer
             # PyMongo releases expose insert_many() for this - confirm
             # against the installed driver version.
             collection.insert(batch)
             batch = []

     # Flush the trailing partial batch; without this the last (<100)
     # transactions were never written.
     if batch:
         collection.insert(batch)

 def main():
     """Load the items and the transaction records, then close the client.

     The MongoDB client is closed even when loading fails. The closing
     banner is only printed after a successful load.
     """
     # print(...) with a single string argument prints the same text on
     # Python 2 and Python 3.
     print('........start loading........')
     try:
         load_items()
         load_record()
     finally:
         client.close()
     print('.........end loading.........')


 if __name__ == '__main__':
     main()

　　进行数据分析 analysis_basket.py

 #Apriori

 from pymongo import MongoClient

 import setting

 # MongoDB client and database, both configured in setting.py.
 client = MongoClient(setting.host, setting.port)

 db = client[setting.db_name]

 # Transactions (one list of item names each); filled by filldata() and
 # scanned by getsupp().
 data = []

 #from mongodb to items

 def filldata():
     """Append the 'items' value of every stored transaction to ``data``."""
     records = db[setting.record_name].find()
     data.extend(doc['items'] for doc in records)

 def connect(items):
     """Join frequent k-itemsets into candidate (k+1)-itemsets.

     Two itemsets are joined when they agree on everything except their
     last element; the candidate is the first itemset followed by the last
     element of the second.

     Args:
         items: dict mapping equal-length item tuples to their support.

     Returns:
         dict mapping each candidate tuple to the support returned by
         ``getsupp`` for it.
     """
     result = {}
     # Sorting is required for correctness: the early ``break`` below
     # assumes itemsets sharing a prefix are adjacent, which raw dict order
     # does not guarantee. sorted() also returns an indexable list on
     # Python 3, where dict.keys() is a view.
     keys = sorted(items)
     for i, left in enumerate(keys):
         prefix = left[:-1]
         for right in keys[i + 1:]:
             if right[:-1] != prefix:
                 # Past the run of itemsets sharing this prefix.
                 break
             key = left + (right[-1],)
             result[key] = getsupp(key)
     return result

 def pruning(items, minsupp):
     """Return a new dict with the entries of ``items`` whose value is at least ``minsupp``."""
     return {itemset: supp for itemset, supp in items.items() if supp >= minsupp}

 def contain(par, sub):
     """Return True when every element of ``sub`` is also found in ``par``."""
     return all(member in par for member in sub)

 def getsupp(item):
     """Count the rows of the module-level ``data`` that contain every element of ``item``."""
     return sum(1 for row in data if contain(row, item))

 def apriori(data, minsupp, k):
     """Mine frequent itemsets of size 1..k with the Apriori algorithm.

     Args:
         data: Iterable of transactions, each an iterable of hashable items.
         minsupp: Minimum absolute support (a count) an itemset needs.
         k: Largest itemset size to mine (inclusive).

     Returns:
         dict mapping ``'k=<size>'`` to ``{itemset tuple: support}``. The
         ``'k=1'`` entry is always present; larger sizes appear only when
         they have at least one frequent itemset.

     NOTE(review): supports for sizes >= 2 come from ``connect``, which via
     ``getsupp`` counts against the module-level ``data`` rather than this
     argument, so callers should pass that same list.
     """
     # Level 1: count every single item.
     candidate_set = {}
     for row in data:
         for i in row:
             key = (i,)
             candidate_set[key] = candidate_set.get(key, 0) + 1
     frequently_set = pruning(candidate_set, minsupp)

     # Use one key spelling for all levels (the original mixed 'k=' and 'K=').
     result = {'k=1': frequently_set}

     # range(2, k + 1) so that itemsets of size k itself are mined.
     for n in range(2, k + 1):
         if len(frequently_set) <= 1:
             # Fewer than two itemsets cannot be joined into candidates.
             break
         frequently_set = pruning(connect(frequently_set), minsupp)
         if not frequently_set:
             break
         # Record the level even when it holds a single itemset; that
         # itemset is still frequent.
         result['k=' + str(n)] = frequently_set
     return result

 def main():
     """Load the transactions, run Apriori and print the frequent itemsets.

     Returns:
         The dict produced by ``apriori(data, 30, 8)``.
     """
     # Close the client even if reading the transactions fails.
     try:
         filldata()
     finally:
         client.close()

     res = apriori(data, 30, 8)
     # The original computed the result and discarded it; print one line
     # per level so the script produces visible output.
     for level in sorted(res):
         print('%s: %s' % (level, res[level]))
     return res


 if __name__ == '__main__':
     main()

秒客网

Apriori算法在购物篮分析中的运用

相关文章