Author: lizhonglin
github: https://github.com/Leezhonglin/
blog: https://leezhonglin.github.io/
After studying the Scrapy framework for this long, I wrote a distributed crawler by hand to check how much I have actually learned.
Main features:
renrenchesipder (人人车 second-hand cars) [project source]
The project crawls second-hand car listings for every region of the 人人车 site using a distributed master/slave setup.
Notes:
Project runtime environment
python3.6.5
scrapy
Databases needed to store the data
redis
mongodb
Python packages the project depends on
pip install scrapy
pip install pymongo
pip install redis
pip install scrapy_redis
How to run the project:
First install the software and Python packages listed above and create a matching virtual environment. Redis and MongoDB must both be running before the crawl starts.
On macOS:
Starting Redis:
redis-server &
: starts the server; the trailing & runs the database in the background
redis-cli
: starts the client
Starting MongoDB:
Type mongod in a terminal to start the server, and mongo to start the client.
The project contains two folders, master (主机) and slave (从机). Their configurations differ, and each file carries detailed comments.
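Judging from the imports used in the code below, each of the two crawlers is laid out roughly like this (an assumed sketch based on the module paths, not an exact file listing):

master/    (the slave folder follows the same pattern)
    renrenchesipder/
        spiders/                      spider module containing RenRenCheSipder
        items.py
        pipelines.py
        middlewares.py                slave only
        utils/useragentsource.py      slave only: USER_AGENT_LIST and PROXY
        settings.py
    plus a small launcher script, shown at the end of each section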
Scrapy framework topics this project touches on:
Random User-Agent
IP proxy pool
Distributed crawling
XPath
Regular expressions
Data storage
Splitting functionality across components, and more
Project file overview:
The contents of each file in the project follow.
Slave (distributed client) files
Slave spider main file
from scrapy_redis.spiders import RedisSpider
from scrapy import Selector

from renrenchesipder.items import RenrenchesipderItem


class RenRenCheSipder(RedisSpider):
    # spider name
    name = 'renrenche'
    # Redis key holding the queue of URLs to crawl
    redis_key = 'renrenche:start_urls'

    # parse a detail page
    def parse(self, response):
        res = Selector(response)
        items = RenrenchesipderItem()
        items['id'] = res.xpath('//div[@class="detail-wrapper"]/@data-encrypt-id').extract()[0]
        # title
        items['title'] = res.xpath('//div[@class="title"]/h1/text()').extract()[0]
        # asking price
        items['price'] = res.xpath('//div[@class="middle-content"]/div/p[2]/text()').extract()[0]
        # price of the same car when new
        items['new_car_price'] = res.xpath('//div[@class="middle-content"]/div/div[1]/span/text()').extract()[0]
        # down payment
        down_payment = res.xpath('//div[@class="list"]/p[@class="money detail-title-right-tagP"]/text()')
        # monthly payment
        monthly_payment = res.xpath('//*[@id="basic"]/div[2]/div[2]/div[1]/div[3]/div[2]/p[5]/text()')
        # only record instalment info when the car can be bought in instalments
        if down_payment and monthly_payment:
            items['staging_info'] = [down_payment.extract()[0], monthly_payment.extract()[0]]
        # service fee
        items['service_fee'] = res.xpath('//*[@id="js-service-wrapper"]/div[1]/p[2]/strong/text()').extract()[0]
        # services included
        items['service'] = res.xpath('//*[@id="js-box-service"]/table/tr/td/table/tr/td/text()').extract()
        # registration date, mileage and relocation info
        items['info'] = res.xpath('//*[@id="basic"]/div[2]/div[2]/div[1]/div[4]/ul/li/div/p/strong/text()').extract()
        # engine displacement
        items['displacement'] = \
            res.xpath('//*[@id="basic"]/div[2]/div[2]/div[1]/div[4]/ul/li[4]/div/strong/text()').extract()[0]
        # city where the car was registered
        items['registration_city'] = res.xpath('//*[@id="car-licensed"]/@licensed-city').extract()[0]
        # listing (source) number
        items['options'] = \
            res.xpath('//*[@id="basic"]/div[2]/div[2]/div[1]/div[5]/p/text()').extract()[0].strip().split(":")[1]
        # only store the image when the listing has one
        if res.xpath('//div[@class="info-recommend"]/div/img/@src'):
            # car image
            items['car_img'] = res.xpath('//div[@class="info-recommend"]/div/img/@src').extract()[0]
        # city where the car is for sale
        items['city'] = res.xpath('//div[@rrc-event-scope="city"]/a[@class="choose-city"]/text()').extract()[0].strip()
        # car colour
        items['color'] = res.xpath('//div[@class="card-table"]/table/tr/td[2]/text()').extract()[0]

        yield items
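Because the slave is a RedisSpider, it sits idle until URLs show up under the renrenche:start_urls key. Normally the master fills that queue, but to test the slave on its own you can push a detail-page URL in by hand. A minimal sketch using redis-py; the listing URL below is hypothetical:

import redis

r = redis.Redis(host='127.0.0.1', port=6379)
# push one detail page onto the list the slave reads from (hypothetical listing URL)
r.lpush('renrenche:start_urls', 'https://www.renrenche.com/bj/car/123456')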
Slave spider items
import scrapy


class RenrenchesipderItem(scrapy.Item):
    '''Item holding one car listing'''
    # MongoDB collection name
    collection = 'car_info'
    # field definitions
    id = scrapy.Field()
    title = scrapy.Field()
    price = scrapy.Field()
    new_car_price = scrapy.Field()
    staging_info = scrapy.Field()
    service_fee = scrapy.Field()
    service = scrapy.Field()
    info = scrapy.Field()
    displacement = scrapy.Field()
    registration_city = scrapy.Field()
    options = scrapy.Field()
    car_img = scrapy.Field()
    city = scrapy.Field()
    color = scrapy.Field()
Slave spider pipelines
import pymongo
from scrapy.conf import settings

from renrenchesipder.items import RenrenchesipderItem


class RenrenchesipderPipeline(object):

    def process_item(self, item, spider):
        return item


class PymongoPiperline(object):
    """Pipeline that writes items into MongoDB"""

    def __init__(self):
        self.MONGODB_HOST = settings['MONGODB_HOST']
        self.MONGODB_PORT = settings['MONGODB_PORT']
        self.MONGODB_DB = settings['MONGODB_DB']
        # create the connection
        conn = pymongo.MongoClient(host=self.MONGODB_HOST, port=self.MONGODB_PORT)
        # select the database
        db = conn[self.MONGODB_DB]
        # get the collection
        self.collection = db[RenrenchesipderItem.collection]

    def process_item(self, item, spider):
        # look the record up by its listing id: insert it if it is new, update it otherwise (upsert)
        self.collection.update_one({'id': item['id']}, {'$set': dict(item)}, upsert=True)
        return item
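After a run you can check what the pipeline stored directly with pymongo. A minimal sketch, assuming the default settings above (database renrenche, collection car_info) and pymongo 3.7 or newer for count_documents:

import pymongo

conn = pymongo.MongoClient(host='127.0.0.1', port=27017)
collection = conn['renrenche']['car_info']
# number of listings saved so far, plus one sample document
print(collection.count_documents({}))
print(collection.find_one())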
Slave spider middlewares
import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

from renrenchesipder.utils.useragentsource import PROXY, USER_AGENT_LIST


class ProxyMiddleware(object):

    def process_request(self, request, spider):
        # pick a random proxy from the pool
        proxy = random.choice(PROXY)
        # set the proxy address; if the proxy only speaks http, use 'http://%s' instead
        request.meta['proxy'] = 'https://%s' % proxy


class RandomUserAgent(UserAgentMiddleware):

    def process_request(self, request, spider):
        # pick a random user agent string
        user_agent = random.choice(USER_AGENT_LIST)
        # set the User-Agent header of the request
        request.headers.setdefault('User-Agent', user_agent)
Slave spider settings
BOT_NAME = 'renrenchesipder'

SPIDER_MODULES = ['renrenchesipder.spiders']
NEWSPIDER_MODULE = 'renrenchesipder.spiders'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.3

# enable the random User-Agent and IP proxy pool middlewares
DOWNLOADER_MIDDLEWARES = {
    'renrenchesipder.middlewares.ProxyMiddleware': 543,
    'renrenchesipder.middlewares.RandomUserAgent': 544,
}

# item pipelines
ITEM_PIPELINES = {
    'renrenchesipder.pipelines.RenrenchesipderPipeline': 300,
    'renrenchesipder.pipelines.PymongoPiperline': 301,
}

# MongoDB constants
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DB = 'renrenche'

# Redis configuration
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379

# use the scrapy_redis scheduler, which keeps the request queue in Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# do not clear the Redis queues, so a crawl can be paused and resumed
SCHEDULER_PERSIST = True
# if True, start URLs are read with Redis 'spop'.
# Useful for avoiding duplicate start URLs; with this enabled the URLs must be added with 'sadd',
# otherwise a type error occurs.
REDIS_START_URLS_AS_SET = False
# Redis-backed deduplication filter
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# request queue class ("scrapy_redis.queue.SpiderQueue" is a first-in, first-out queue)
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
Slave spider utils: the User-Agent list and the IP proxy pool
USER_AGENT_LIST = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.132 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0",
]

PROXY = [
    '173.82.219.113:3128',
    '92.243.6.37:80',
    '117.102.96.59:8080',
    '213.234.28.94:8080',
    '101.51.123.88:8080',
    '158.58.131.214:41258',
    '36.83.78.183:80',
    '103.56.30.128:8080',
    '185.231.209.251:41258',
    '178.22.250.244:53281',
    '89.216.76.253:53281',
    '179.124.59.240:53281',
    '36.74.207.47:8080',
    '104.237.252.30:8181',
    '183.89.1.16:8080',
    '202.183.201.7:8081',
    '140.227.73.83:3128',
    '191.33.95.123:8080',
    '103.208.181.10:53281',
    '77.46.239.33:8080',
    '94.74.191.82:80',
    '82.202.70.14:8080',
    '187.120.211.38:20183',
    '124.205.155.150:9090',
    '91.109.16.36:8080',
    '182.88.89.53:8123',
    '79.106.162.222:8080',
    '91.142.239.124:8080',
    '184.65.158.128:8080',
    '188.191.28.115:53281',
]
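Free proxies such as the ones in PROXY die quickly, so it is worth weeding out the dead ones before a long crawl. A minimal sketch, assuming the requests package is installed (it is not in the project's dependency list) and that the script is run from the slave project root so the import resolves:

import requests

from renrenchesipder.utils.useragentsource import PROXY

alive = []
for proxy in PROXY:
    try:
        # the middleware routes https traffic through the proxy, so test it the same way
        requests.get('https://www.renrenche.com', proxies={'https': 'https://%s' % proxy}, timeout=5)
        alive.append(proxy)
    except requests.RequestException:
        pass

print('%d of %d proxies responded' % (len(alive), len(PROXY)))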
Slave launcher
from scrapy.cmdline import execute

# the last argument must match the spider's name attribute
execute(['scrapy', 'crawl', 'renrenche'])
Master (distributed server) files
Master spider main file
import re

from scrapy_redis.spiders import RedisSpider
from scrapy import Selector, Request

from renrenchesipder.items import MasterItem


class RenRenCheSipder(RedisSpider):
    name = 'renrenche'
    # site domain
    domain_url = 'https://www.renrenche.com'
    # restrict crawling to this domain
    allowed_domains = ['www.renrenche.com']

    def start_requests(self):
        yield Request(self.domain_url)

    # parse the list of cities
    def parse(self, response):
        res = Selector(response)
        city_url_list = res.xpath('//div[@class="area-city-letter"]/div/a[@class="province-item "]/@href')
        for city_url in city_url_list:
            city = city_url.extract()
            yield Request(self.domain_url + city, callback=self.parse_brand)

    # parse all brands of a city
    def parse_brand(self, response):
        res = Selector(response)
        brand_url_list = res.xpath('//*[@id="brand_more_content"]/div/p/span/a')
        for a in brand_url_list:
            band_url = a.xpath('./@href').extract()[0]
            yield Request(self.domain_url + band_url, callback=self.parse_page_url)

    # parse a brand's listing pages and extract the detail URL of every car
    def parse_page_url(self, response):
        res = Selector(response)
        # all listing entries on the page, used below to detect the last page
        li_list = res.xpath('//ul[@class="row-fluid list-row js-car-list"]/li')
        # only continue while the page still contains listings
        if li_list:
            for c in li_list:
                # the detail URL of one car, skipping the advertisement entries
                one_car_url = c.xpath('./a[@class="thumbnail"]/@href').extract()
                if one_car_url:
                    # a fresh item for every car
                    item = MasterItem()
                    item['url'] = self.domain_url + one_car_url[0]
                    yield item

            # next page
            page = response.meta.get('page', 2)
            url = response.url
            # strip any existing /p1/p2/ segment so only one page parameter remains
            url = re.sub(r'p\d+', '', url)
            # build the URL of the next page
            car_info_url = url + 'p{page}/'
            # request the next page and keep counting up
            yield Request(car_info_url.format(page=page), meta={'page': page + 1}, callback=self.parse_page_url)
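The pagination logic in parse_page_url strips any existing pN segment from the current URL before appending the next page number. A small worked example with a hypothetical brand listing URL:

import re

current = 'https://www.renrenche.com/bj/audi/'     # hypothetical first listing page, no page suffix yet
stripped = re.sub(r'p\d+', '', current)            # nothing to strip, URL unchanged
next_url = (stripped + 'p{page}/').format(page=2)
print(next_url)                                    # https://www.renrenche.com/bj/audi/p2/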
Master spider items
import scrapy


class MasterItem(scrapy.Item):
    url = scrapy.Field()
Master spider pipelines
import redis
from scrapy.conf import settings


class RenrenchesipderPipeline(object):

    def process_item(self, item, spider):
        return item


class MasterPipeline(object):

    def __init__(self):
        # read the Redis connection settings
        self.REDIS_HOST = settings['REDIS_HOST']
        self.REDIS_PORT = settings['REDIS_PORT']
        # connect to Redis
        self.r = redis.Redis(host=self.REDIS_HOST, port=self.REDIS_PORT)

    def process_item(self, item, spider):
        # push the detail-page URL onto the queue the slaves read from
        self.r.lpush('renrenche:start_urls', item['url'])
        return item
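While the master runs, you can watch how many detail URLs are waiting for the slaves. A minimal sketch with redis-py:

import redis

r = redis.Redis(host='127.0.0.1', port=6379)
# length of the list that MasterPipeline pushes into
print(r.llen('renrenche:start_urls'))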
Master settings
BOT_NAME = 'renrenchesipder'

SPIDER_MODULES = ['renrenchesipder.spiders']
NEWSPIDER_MODULE = 'renrenchesipder.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.3

ITEM_PIPELINES = {
    'renrenchesipder.pipelines.RenrenchesipderPipeline': 300,
    'renrenchesipder.pipelines.MasterPipeline': 303,
}

# Redis configuration
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
Master launcher
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'renrenche'])
That completes the project.
Results: