# Scrapy settings for example project
# For simplicity, this file contains only the most important settings by default.
# All the other settings are documented here:
# http://doc.scrapy.org/topics/settings.html
SPIDER_MODULES = ['example.spiders']  # change to your project name
NEWSPIDER_MODULE = 'example.spiders'  # change to your project name
# Change the user agent
USER_AGENT = 'scrapy-redis (+https://github.com/rolando/scrapy-redis)'
# Reconfigure the core components to use scrapy-redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep the scheduler queue in Redis so an interrupted crawl can resume where it left off
SCHEDULER_PERSIST = True
# Optional scheduler queue classes (the first, the priority queue, is the default)
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"
ITEM_PIPELINES = {
    'example.pipelines.ExamplePipeline': 300,
    # Pipelines you write yourself must be registered here as well
    # (see the pipeline sketch after the settings).
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
# Redis connection URL
REDIS_URL = 'redis://127.0.0.1:6379'  # change this to your own Redis server
LOG_LEVEL = 'DEBUG'
# Introduce an artificial delay to make use of parallelism.
DOWNLOAD_DELAY = 1
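For reference, a minimal sketch of what the example.pipelines.ExamplePipeline registered above might look like; the 'crawled' and 'spider' field names are illustrative assumptions, and RedisPipeline (priority 400) then serializes each item into Redis:

# pipelines.py (sketch)
from datetime import datetime

class ExamplePipeline:
    def process_item(self, item, spider):
        # Stamp each item with crawl time and spider name before
        # RedisPipeline stores it (field names are assumptions).
        item['crawled'] = datetime.utcnow()
        item['spider'] = spider.name
        return item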
Everything else (proxies and the like) is configured through the usual Scrapy settings.
middlewares.py: user-agent and proxy handling go here, as sketched below.
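A minimal sketch of such a downloader middleware, assuming hypothetical USER_AGENTS and PROXIES lists; the class names are illustrative, not part of Scrapy or scrapy-redis:

# middlewares.py (sketch)
import random

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
]
PROXIES = ['http://127.0.0.1:8080']  # replace with your own proxies

class RandomUserAgentMiddleware:
    def process_request(self, request, spider):
        # Pick a random User-Agent for each outgoing request.
        request.headers['User-Agent'] = random.choice(USER_AGENTS)

class RandomProxyMiddleware:
    def process_request(self, request, spider):
        # Route the request through a randomly chosen proxy;
        # Scrapy's HttpProxyMiddleware reads request.meta['proxy'].
        request.meta['proxy'] = random.choice(PROXIES)

Both would then be enabled in settings.py via DOWNLOADER_MIDDLEWARES, e.g. {'example.middlewares.RandomUserAgentMiddleware': 543, 'example.middlewares.RandomProxyMiddleware': 544}.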
Running Scrapy in distributed mode:
cd <project directory>
scrapy runspider <spider file>.py
The spider hangs at this point because it has not yet received a start URL.
Open a new terminal:
redis-cli -h <redis host>
lpush <redis_key name> <start url>
The crawl then begins.
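To make the redis_key/lpush handshake concrete, here is a minimal RedisSpider sketch; the spider name, redis_key value, and parse logic are illustrative assumptions:

# spiders/myspider.py (sketch)
from scrapy_redis.spiders import RedisSpider

class MySpider(RedisSpider):
    name = 'myspider'
    # The spider blocks on this Redis list until a URL is pushed into it.
    redis_key = 'myspider:start_urls'

    def parse(self, response):
        yield {'url': response.url, 'title': response.css('title::text').get()}

For this spider the seeding command would be lpush myspider:start_urls http://example.com/; every worker running the same spider then pulls requests and dupefilter state from the shared Redis scheduler.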