The idea: there are several channels (categories), each channel lists many product links, and every product has its own detail page. First crawl the product links from each channel page and store them in the database; then take each detail-page link back out of the database and crawl the information on that detail page.
First, channel_extract.py, which crawls the links of the different channels:
from bs4 import BeautifulSoup
import requests

start_url = 'http://bj.58.com/sale.shtml'
url_host = 'http://bj.58.com'

def get_index_url(url):
    # Extract the navigation-bar links; each channel has its own listing pages
    # url = start_url
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('ul.ym-submnu > li > b > a')  # grab all navigation-bar links
    # print(links)
    for link in links:
        page_url = url_host + link.get('href')  # build the absolute link
        print(page_url)

get_index_url(start_url)

# Long string of channel URLs collected from the output above (the phone-number channel was removed)
channel_list = '''
    http://bj.58.com/shouji/
    http://bj.58.com/tongxunyw/
    http://bj.58.com/diannao/
    http://bj.58.com/bijiben/
    http://bj.58.com/pbdn/
    http://bj.58.com/diannaopeijian/
    http://bj.58.com/zhoubianshebei/
    http://bj.58.com/shuma/
    http://bj.58.com/shumaxiangji/
    http://bj.58.com/mpsanmpsi/
    http://bj.58.com/youxiji/
    http://bj.58.com/jiadian/
    http://bj.58.com/dianshiji/
    http://bj.58.com/ershoukongtiao/
    http://bj.58.com/xiyiji/
    http://bj.58.com/bingxiang/
    http://bj.58.com/binggui/
    http://bj.58.com/chuang/
    http://bj.58.com/ershoujiaju/
    http://bj.58.com/yingyou/
    http://bj.58.com/yingeryongpin/
    http://bj.58.com/muyingweiyang/
    http://bj.58.com/muyingtongchuang/
    http://bj.58.com/yunfuyongpin/
    http://bj.58.com/fushi/
    http://bj.58.com/nanzhuang/
    http://bj.58.com/fsxiemao/
    http://bj.58.com/xiangbao/
    http://bj.58.com/meirong/
    http://bj.58.com/yishu/
    http://bj.58.com/shufahuihua/
    http://bj.58.com/zhubaoshipin/
    http://bj.58.com/yuqi/
    http://bj.58.com/tushu/
    http://bj.58.com/tushubook/
    http://bj.58.com/wenti/
    http://bj.58.com/yundongfushi/
    http://bj.58.com/jianshenqixie/
    http://bj.58.com/huju/
    http://bj.58.com/qiulei/
    http://bj.58.com/yueqi/
    http://bj.58.com/bangongshebei/
    http://bj.58.com/diannaohaocai/
    http://bj.58.com/bangongjiaju/
    http://bj.58.com/ershoushebei/
    http://bj.58.com/danche/
    http://bj.58.com/fzixingche/
    http://bj.58.com/diandongche/
    http://bj.58.com/sanlunche/
    http://bj.58.com/peijianzhuangbei/
    http://bj.58.com/tiaozao/
'''
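main.py later relies on channel_list.split() to turn this long string into a Python list of channel URLs. A minimal check of that conversion, using only the names from the script above:

from channel_extract import channel_list

channels = channel_list.split()   # splitting on whitespace yields one channel URL per entry
print(len(channels))              # how many channels will be crawled
print(channels[0])                # first channel, e.g. http://bj.58.com/shouji/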
Next, pages_parsing.py, which contains two spiders: spider 1 stores every product link under a channel in the database, and spider 2 crawls the information on each detail page and stores it in the database.
from bs4 import BeautifulSoup
import requests
import time
import pymongo

# Store the product links scraped from each channel in the database, then take each
# detail-page link out of the database and crawl it with spider 2, storing the result
# in the database as well.
client = pymongo.MongoClient('localhost', 27017)
ceshi = client['ceshi']            # database
url_list = ceshi['url_list4']      # collection for the product links
item_info = ceshi['item_info4']    # collection for the detail-page information
# The name on the left is the Python object; the name in brackets is the one used in the database.

# spider 1: grab all product links under one category, one listing page per call
def get_links_from(channel, pages, who_sells=1):
    # e.g. http://bj.58.com/diannao/pn2/
    list_view = '{}{}/pn{}/'.format(channel, str(who_sells), str(pages))
    wb_data = requests.get(list_view)
    time.sleep(1)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # Each product is one table row (td.t); if it is missing, the pages are exhausted and we stop.
    if soup.find('td', 't'):
        for link in soup.select('td.t a.t'):          # loop over the product links on this page
            item_link = link.get('href').split('?')[0]
            url_list.insert_one({'url': item_link})   # insert the link into the database
            print(item_link)
# get_links_from('http://bj.58.com/shuma/', 2)

# spider 2: crawl the detail-page information; taken-down listings redirect to a 404 page
def get_item_info(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    no_longer_exist = '404' in soup.find('script', type='text/javascript').get('src').split('/')
    if no_longer_exist:   # skip listings that no longer exist
        pass
    else:
        title = soup.title.text
        price = soup.select('span.price.c_f50')
        date = soup.select('.time')
        # Some listings have no address at all
        area = soup.select('.c_25d a') if soup.find_all('span', 'c_25d') else []
        areas = [are.get_text() for are in area]
        areas2 = []   # join the multi-level address parts; could be improved
        for i in range(0, len(areas) - 1):
            areas2.append(areas[i] + '-' + areas[i + 1])
        for pric, dat in zip(price, date):
            data = {
                'title': title,
                'price': pric.get_text(),
                'date': dat.get_text(),
                'area': areas2,   # the multi-level address is stored as a list
                'url': url,
            }
            item_info.insert_one(data)
            print(data)

# urls = [get_links_from('http://bj.58.com/shouji/', 3)]
# for url in urls:
#     get_item_info(url)
url1 = 'http://bj.58.com/shuma/29075926847818x.shtml'
get_item_info(url1)
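The script above only tests get_item_info on a single hard-coded URL. Following the plan at the top, the detail-page links still have to be read back out of url_list and crawled one by one; a minimal sequential sketch, reusing the names defined above:

from pages_parsing import url_list, get_item_info

# Pull every stored product link back out of MongoDB and crawl its detail page.
for record in url_list.find():
    get_item_info(record['url'])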
The main function, main.py, which runs the crawl across multiple processes:
from multiprocessing import Pool   # multiprocessing lets the task use several CPU cores
from channel_extract import channel_list
from pages_parsing import get_links_from

def get_all_links_from(channel):
    # crawl many listing pages per channel and store every page's links in the database
    for i in range(1, 100):
        get_links_from(channel, i)

if __name__ == '__main__':   # required guard for multiprocessing
    pool = Pool()   # create a process pool; the worker count defaults to the number of CPU cores
    pool.map(get_all_links_from, channel_list.split())   # map feeds each channel URL in channel_list to get_all_links_from
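The same Pool pattern can also drive the detail-page crawl once spider 1 has filled url_list; a sketch of the parallel version of the sequential loop shown earlier, assuming the channel crawl above has already run:

from multiprocessing import Pool
from pages_parsing import url_list, get_item_info

if __name__ == '__main__':
    urls = [record['url'] for record in url_list.find()]   # all stored detail-page links
    pool = Pool()                      # worker count defaults to the number of CPU cores
    pool.map(get_item_info, urls)      # crawl the detail pages in parallel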
Finally, a small script that keeps reporting how many product links have been collected in the database:
import time
from pages_parsing import url_list

while True:
    # count_documents replaces the deprecated cursor count() in recent pymongo versions
    print(url_list.count_documents({}))
    time.sleep(5)