---多url爬取---
import scrapy

from many_url_crawl.items import ManyUrlCrawlItem


# Crawl the first N pages of the qiushibaike "text" section.
class HandleSpider(scrapy.Spider):
    """Spider that walks pages 1..14 of qiushibaike text jokes.

    Yields one ManyUrlCrawlItem (author, content) per post, then
    schedules the next page until the page limit is reached.
    """
    name = 'handle'
    # allowed_domains = ['www.qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/']
    # URL template for pages after the first one.
    url = 'https://www.qiushibaike.com/text/page/%d/'
    pageNum = 1

    def parse(self, response):
        """Extract author/content pairs from one listing page.

        :param response: scrapy Response for a listing page.
        :yields: ManyUrlCrawlItem instances, then a Request for the
                 next page while ``pageNum`` <= 13.
        """
        div_list = response.xpath('//*[@id="content-left"]/div')
        for post in div_list:
            # extract_first() returns None when the node is absent,
            # where the original .extract()[0] raised IndexError and
            # aborted parsing of the whole page.
            author = post.xpath(
                './/div[@class="author clearfix"]/a[2]/h2/text()'
            ).extract_first()
            content = post.xpath(
                './/div[@class="content"]/span/text()'
            ).extract_first()
            item = ManyUrlCrawlItem()
            item['author'] = author
            item['content'] = content
            yield item
        # Hard page cap — without it the spider would recurse forever.
        if self.pageNum <= 13:
            self.pageNum += 1
            print('爬第:%d 页' % self.pageNum)
            new_url = self.url % self.pageNum
            # callback: the new page is parsed by this same method.
            yield scrapy.Request(url=new_url, callback=self.parse)
---爬取POST请求---
import scrapy


class ScrPostSpider(scrapy.Spider):
    """Demo spider that POSTs a form to the Baidu translate endpoint.

    Two ways to send a POST request in Scrapy:
      1. scrapy.Request(..., method='POST')
      2. scrapy.FormRequest(...)  -- the recommended one, used below.
    """
    name = 'scr_post'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://fanyi.baidu.com/translate']

    # Override of the base-class hook that would otherwise issue a
    # plain GET for every entry in start_urls.
    def start_requests(self):
        payload = {'query': 'dog'}
        for target in self.start_urls:
            # formdata: dict of POST form parameters.
            yield scrapy.FormRequest(url=target, formdata=payload,
                                     callback=self.parse)

    def parse(self, response):
        """Dump the raw response body for inspection."""
        print(response.text)
---在middlewares.py中---
# Custom downloader-middleware class; implement process_request to
# handle every request the middleware chain intercepts.
class MyProxy(object):
    """Downloader middleware that routes all requests through one fixed proxy."""

    def process_request(self, request, spider):
        # Swap the outgoing IP by attaching a proxy to the request meta.
        request.meta['proxy'] = 'http://120.210.219.101:8080'
在settings.py中(约第55行)启用下载中间件:
# settings.py: enable the custom proxy middleware. The number (543) is
# the middleware's priority order in the downloader chain.
DOWNLOADER_MIDDLEWARES = {
'proxy.middlewares.MyProxy': 543,
}
---指定日志等级---
在settings.py中设置 LOG_LEVEL = 'ERROR'