爬取了14W条数据,存入MongoDB,并用charts库展示统计结果;这里展示一个示例。
模块1 获取分类url列表
import pymongo
import requests
from bs4 import BeautifulSoup

# Module 1: read the sub-menu of the 58.com Beijing second-hand index page
# and store one {'link': <category url>} document per category in the
# '58tc.link_list' collection.

main_url = 'http://bj.58.com/sale.shtml'

client = pymongo.MongoClient('localhost', 27017)
tc_58 = client['58tc']
tab_link_list = tc_58['link_list']

# Category pages that are never stored.
SKIPPED_LINKS = frozenset((
    'http://bj.58.com/shoujihao/',
    'http://bj.58.com/tongxunyw/',
))
# Category page that is stored only the first time it is encountered.
ONCE_ONLY_LINK = 'http://bj.58.com/tiaozao/'

web_data = requests.get(main_url)
soup = BeautifulSoup(web_data.text, 'lxml')
sub_menu_link = soup.select('ul.ym-submnu > li > b > a')

link_list = []
count = 0  # number of times ONCE_ONLY_LINK has been seen so far
for anchor in sub_menu_link:
    href = anchor.get('href')
    if not href:
        # An anchor without an href cannot be turned into a category URL.
        continue
    link = 'http://bj.58.com' + href
    if link in SKIPPED_LINKS:
        continue
    if link == ONCE_ONLY_LINK:
        count += 1
        if count > 1:
            continue
    link_list.append({'link': link})

# Collection.insert() was removed in PyMongo 4; insert_many() (PyMongo >= 3.0)
# is the replacement. It rejects an empty document list, hence the guard.
if link_list:
    tab_link_list.insert_many(link_list)
模块2 获取每个商品详情信息
import re
import sys
from multiprocessing import Pool

import pymongo
import requests
from bs4 import BeautifulSoup

# Module 2: collect the detail-page URLs of every category and scrape each
# detail page.

client = pymongo.MongoClient('localhost', 27017)
tc_58 = client['58tc']
tab_link_list = tc_58['link_list']

original_price_patt = re.compile('原价:(.+)')

# Shared prefixes of the CSS selectors used by getInfo().
_BOX_LEFT = 'body > div.content > div > div.box_left'
_INFO_BOX = _BOX_LEFT + ' > div.info_lubotu.clearfix'
_TOP_BAR = _INFO_BOX + ' > div.box_left_top > p'
_INFO_MSG = _INFO_BOX + ' > div.info_massege.left'
_SELLER_BOX = ('body > div.content > div > div.box_right'
               ' > div.personal.jieshao_div > div.personal_jieshao')


def getDetailUrl(page_url, tab):
    """Store the detail-page links found on one listing page.

    Args:
        page_url: URL of a category listing page.
        tab: category name; links are written to the '58tc.<tab>_list'
            collection as {'link': <url without query string>} documents.

    Returns:
        The number of links inserted (0 when the page lists nothing).
    """
    web_data = requests.get(page_url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    anchors = soup.select(
        'div.infocon > table > tbody > tr > td.t > a[onclick]')
    # Keep only the part before '?'; anchors without an href are skipped.
    docs = [
        {'link': anchor.get('href').split('?')[0]}
        for anchor in anchors
        if anchor.get('href')
    ]
    if not docs:
        # insert_many() rejects an empty document list.
        return 0

    client = pymongo.MongoClient('localhost', 27017)
    try:
        # Collection.insert() was removed in PyMongo 4.
        client['58tc'][tab + '_list'].insert_many(docs)
    finally:
        # One client is opened per page, so release it when done.
        client.close()
    return len(docs)


def getInfo(detail_url):
    """Scrape one item detail page.

    Args:
        detail_url: URL of the item detail page.

    Returns:
        A dict with the keys 'title', 'view_count', 'want_count',
        'current_price', 'original_price', 'location', 'tag',
        'seller_name', 'level_count', 'desc' and 'link', or None when the
        page cannot be fetched or a mandatory element is missing (the
        error is printed).
    """
    try:
        web_data = requests.get(detail_url)
        soup = BeautifulSoup(web_data.text, 'lxml')

        title = soup.title.text.strip()
        # Mandatory fields: a missing element raises IndexError, which is
        # handled below and makes the whole page count as failed.
        view_count = soup.select(_TOP_BAR + ' > span.look_time')[0].text
        want_count = soup.select(_TOP_BAR + ' > span.want_person')[0].text

        # Optional fields fall back to None when the element is absent.
        current_price = soup.select(_INFO_MSG + ' > div.price_li > span > i')
        current_price = current_price[0].text if current_price else None

        original_price = soup.select(_INFO_MSG + ' > div.price_li > span > b')
        original_price = original_price[0].text if original_price else None
        # Result is the list of regex matches, or None when there is no text.
        original_price = (re.findall(original_price_patt, original_price)
                          if original_price else None)

        location = soup.select(
            _INFO_MSG + ' > div.palce_li > span > i')[0].text

        tag = soup.select(_INFO_MSG + ' > div.biaoqian_li')
        tag = list(tag[0].stripped_strings) if tag else None

        seller_name = soup.select(_SELLER_BOX + ' > p.personal_name')[0].text

        # NOTE(review): the trailing space in 'icon_png ' is kept from the
        # original; confirm it matches the intended <span> elements.
        full_count = len(soup.find_all('span', class_='icon_png '))
        half_count = len(soup.find_all('span', class_='icon_png smallScore'))
        level_count = {'full': full_count, 'half': half_count}

        desc = soup.select(_BOX_LEFT + ' > div:nth-of-type(3) > div > div > p')
        desc = desc[0].text if desc else None

        return {
            'title': title,
            'view_count': view_count,
            'want_count': want_count,
            'current_price': current_price,
            'original_price': original_price,
            'location': location,
            'tag': tag,
            'seller_name': seller_name,
            'level_count': level_count,
            'desc': desc,
            'link': detail_url,
        }
    except Exception as exc:
        # Best-effort scraping: report and give up on this page only.
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so the crawl can still be stopped.
        print(type(exc), exc)
        return None


def insertDetailLin(sub_menu_list):
    """Collect detail links for every category stored in *sub_menu_list*.

    Args:
        sub_menu_list: collection whose documents carry a 'link' field
            holding a category URL.

    Returns:
        The names ('<category>_list') of the collections that received at
        least one link.
    """
    patt = re.compile('.+?com/([a-z]+)/')
    tab_list = []
    for doc in sub_menu_list.find({}, {'link': 1, '_id': 0}):
        base_url = doc['link']
        sub_menu_name = re.findall(patt, base_url)[0]
        print(sub_menu_name + ': ', end='')
        # Observed pattern: each category has at most 70 listing pages.
        cnt = sum(
            getDetailUrl(base_url + 'pn' + str(page), sub_menu_name)
            for page in range(1, 71)
        )
        print(str(cnt) + ' lines inserted')
        if cnt != 0:
            tab_list.append(sub_menu_name + '_list')
    return tab_list
# Names of the per-category link collections ('<category>_list').
# 'link_list' holds the category URLs themselves and is excluded.
# collection_names() was removed in PyMongo 4; list_collection_names()
# (PyMongo >= 3.6) is the replacement.
allMenCollectionName = [
    name for name in tc_58.list_collection_names()
    if name.endswith('_list') and name != 'link_list'
]


def insertData(tab_name):
    """Scrape every link of one category and store the results.

    Args:
        tab_name: name of a '<category>_list' collection in the '58tc'
            database. Results go to '58tcDataNew.<category>_data'.
    """
    # Each call opens its own client because this function runs inside
    # pool worker processes.
    client = pymongo.MongoClient('localhost', 27017)
    try:
        source = client['58tc'][tab_name]
        # tab_name[:-5] strips the '_list' suffix.
        target = client['58tcDataNew'][tab_name[:-5] + '_data']
        # Read all links up front so no cursor stays open while the
        # (slow) pages are being scraped.
        links = [doc['link'] for doc in source.find({}, {'link': 1, '_id': 0})]
        for link in links:
            data = getInfo(link)
            # getInfo() returns None for pages it could not scrape.
            if data is not None:
                target.insert_one(data)
    finally:
        client.close()


def getContinuingly(fenlei):
    """Return the links of category *fenlei* that have not been scraped yet.

    Args:
        fenlei: category name (without the '_list' / '_data' suffix).

    Returns:
        A list of the links present in '58tc.<fenlei>_list' but absent
        from '58tcDataNew.<fenlei>_data' (order is unspecified).
    """
    client = pymongo.MongoClient('localhost', 27017)
    try:
        fenlei_data = client['58tcDataNew'][fenlei + '_data']
        fenlei_list = client['58tc'][fenlei + '_list']
        only_link = {'link': 1, '_id': 0}
        done_urls = {item['link'] for item in fenlei_data.find({}, only_link)}
        index_urls = {item['link'] for item in fenlei_list.find({}, only_link)}
    finally:
        client.close()
    return list(index_urls - done_urls)


def startgetContinuingly(fenlei):
    """Resume scraping category *fenlei* from where a previous run stopped.

    Args:
        fenlei: category name (without the '_list' / '_data' suffix).
    """
    rest_of_urls = getContinuingly(fenlei)
    client = pymongo.MongoClient('localhost', 27017)
    try:
        target = client['58tcDataNew'][fenlei + '_data']
        for url in rest_of_urls:
            data = getInfo(url)
            # Pages that failed to scrape stay pending for the next run.
            if data is not None:
                target.insert_one(data)
    finally:
        client.close()


# The guard keeps worker processes from re-launching the pool when the
# module is re-imported under the 'spawn' start method.
if __name__ == '__main__':
    with Pool() as pool:
        pool.map(insertData, allMenCollectionName)
模块3 分析
from collections import Counter

import charts
import pymongo

# Module 3: aggregate the scraped items by district.


def getTotalCount(database, host=None, port=None):
    """Print and return the number of documents in *database*.

    Args:
        database: database name.
        host: MongoDB host passed to MongoClient (None = driver default).
        port: MongoDB port passed to MongoClient (None = driver default).

    Returns:
        The sum of the document counts of every collection.
    """
    client = pymongo.MongoClient(host, port)
    try:
        db = client[database]
        # Cursor.count() and collection_names() were removed in PyMongo 4.
        count = sum(
            db[name].count_documents({})
            for name in db.list_collection_names()
        )
    finally:
        client.close()
    print(count)
    return count


def getAreaByClassify(classify, database='58tcDataNew', host=None, port=None):
    """Count the items of one category per district.

    Only documents whose 'location' starts with '北京' and has a non-empty
    remainder after the third character are counted; that remainder is
    used as the district name.

    Args:
        classify: category name; the '<classify>_data' collection is read.
        database: database name.
        host: MongoDB host passed to MongoClient (None = driver default).
        port: MongoDB port passed to MongoClient (None = driver default).

    Returns:
        A dict mapping district name to number of items.
    """
    client = pymongo.MongoClient(host, port)
    try:
        # Query the requested category. The previous code always read
        # 'yueqi_data' regardless of *classify*.
        collection = client[database][classify + '_data']
        counts = Counter()
        for doc in collection.find(filter={},
                                   projection={'location': 1, '_id': 0}):
            location = doc.get('location') or ''
            # location[2] is skipped — presumably a separator between the
            # city and the district; verify against the stored data.
            district = location[3:]
            if location[:2] == '北京' and district != '':
                counts[district] += 1
    finally:
        client.close()
    return dict(counts)


def myCounter(L, database='58tcDataNew', host=None, port=None):
    """Add up a list of {name: count} mappings.

    Args:
        L: list of dicts to add together.
        database: database name (only consulted when *L* is empty).
        host: MongoDB host passed to MongoClient (None = driver default).
        port: MongoDB port passed to MongoClient (None = driver default).

    Returns:
        A Counter holding the per-key sums. Counter addition drops
        entries whose total is not positive. When *L* is empty the result
        is a Counter with a zero entry '<name>_area_count' for every
        collection of *database* (its name minus the last five
        characters), as before.
    """
    if not L:
        client = pymongo.MongoClient(host, port)
        try:
            names = client[database].list_collection_names()
        finally:
            client.close()
        return Counter({name[:-5] + '_area_count': 0 for name in names})

    # Fold from the right, exactly as the former recursion did, but
    # without opening a database connection per element or being bound by
    # the recursion limit.
    total = Counter()
    for counts in reversed(L):
        total = Counter(counts) + total
    return total


def getAllCount(database='58tcDataNew', host=None, port=None):
    """Build the per-category and overall district statistics.

    Args:
        database: database name.
        host: MongoDB host passed to MongoClient (None = driver default).
        port: MongoDB port passed to MongoClient (None = driver default).

    Returns:
        A dict with one '<category>_area_count' entry per collection
        (district -> count) plus 'total_area_count' with the sum over all
        categories.
    """
    client = pymongo.MongoClient(host, port)
    try:
        tab_list = client[database].list_collection_names()
    finally:
        client.close()

    dic_all_count = {}
    for name in tab_list:
        category = name[:-5]  # strip the '_data' suffix
        # Forward the connection settings so that a non-default database
        # or host is honoured by the helpers as well.
        dic_all_count[category + '_area_count'] = getAreaByClassify(
            category, database, host, port)

    total = myCounter(list(dic_all_count.values()), database, host, port)
    dic_all_count['total_area_count'] = dict(total)
    return dic_all_count
# Plot the overall per-district totals as one column series per district.
dic_all_count = getAllCount()

total_by_area = dic_all_count['total_area_count']
tmp_list = [
    {'name': area, 'data': [total_by_area[area]], 'type': 'column'}
    for area in total_by_area
]

options = {
    'chart': {'zoomType': 'xy'},
    'title': {'text': '北京58同城二手交易信息发布区域分布图'},
    'subtitle': {'text': '数据来源: 58.com'},
    'xAxis': {'categories': ['']},
    'yAxis': {'title': {'text': '数量'}},
    'plotOptions': {'column': {'dataLabels': {'enabled': True}}},
}

charts.plot(tmp_list, show='inline', options=options)