Analysis
The crawl uses CrawlSpider together with LinkExtractor and Rule to collect the page data.
LinkExtractor defines the link-extraction rules; in most cases the allow parameter is all you need. Its full signature is shown below, followed by a short usage sketch.
LinkExtractor(allow=(),               # regex pattern(s) selecting the links to extract
              deny=(),                # regex pattern(s) excluding links
              allow_domains=(),       # only extract links within these domains
              deny_domains=(),        # never extract links from these domains
              restrict_xpaths=(),     # XPath expressions restricting where links are extracted from
              tags=('a', 'area'),
              attrs=('href',),
              canonicalize=False,
              unique=True,
              process_value=None,
              deny_extensions=None,
              restrict_css=(),        # CSS selectors restricting where links are extracted from
              strip=True)
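As a quick illustration of the allow parameter (a sketch, not part of the original project), a LinkExtractor can be tried out interactively in scrapy shell, where a response object is already bound; extract_links() returns the Link objects matched by the rules:

from scrapy.linkextractors import LinkExtractor

# Only keep links whose URL matches the pagination pattern,
# the same kind of regex the spider below uses.
link_extractor = LinkExtractor(allow=(r'page=\d+',))

# `response` is provided by `scrapy shell <url>`;
# extract_links() returns a list of Link objects.
for link in link_extractor.extract_links(response):
    print(link.url, link.text)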
Rule defines a crawling rule for the CrawlSpider: the spider picks the rules up automatically, schedules the requests, fetches the responses, and hands each response to the callback method named by callback.
If a callback is specified, follow defaults to False; if callback is None, follow defaults to True. The signature is shown below, followed by a sketch of the process_links hook.
Rule(link_extractor,            # a LinkExtractor object, required
     callback=None,             # callback method, optional
     cb_kwargs=None,
     follow=None,               # whether to keep following links from matched pages (True/False)
     process_links=None,        # hook to filter/rewrite extracted links (some anti-scraping tricks return fake URLs)
     process_request=identity)
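The process_links hook receives the list of links produced by the LinkExtractor and must return the (possibly filtered or rewritten) list before requests are scheduled. A minimal sketch, assuming a hypothetical site that serves decoy URLs ('fake.example.com' is made up purely for illustration):

def clean_links(links):
    """Filter and rewrite links before they are turned into requests."""
    fixed = []
    for link in links:
        # Rewrite the decoy host back to the real one (illustrative only).
        link.url = link.url.replace('fake.example.com', 'www.zhipin.com')
        fixed.append(link)
    return fixed

# Wired into a rule like this:
# Rule(link_extractor, callback='parse_page', follow=True, process_links=clean_links)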
Source Code
items.py
import scrapy

class BosszhipinItem(scrapy.Item):
    """Item for the BOSS Zhipin Python job spider."""
    # Job title
    position = scrapy.Field()
    # Company name
    company = scrapy.Field()
    # Salary
    salary = scrapy.Field()
    # Work location
    location = scrapy.Field()
    # Education requirement
    education = scrapy.Field()
    # Work experience
    year = scrapy.Field()
spiders/bosszhipin_spider.py
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from myscrapy.items import BosszhipinItem

class BosszhipinSpider(CrawlSpider):
    """
    BOSS Zhipin Python job spider,
    implemented on top of the CrawlSpider base class.
    """
    name = 'bosszhipin'
    allowed_domains = ['zhipin.com']
    start_urls = ['http://www.zhipin.com/c100010000/h_100010000/?query=python&page=1']

    # Link extractor object (defines which links to follow)
    link_extractor = LinkExtractor(allow=(r'page=\d+',))

    # List of crawling rules:
    # the spider automatically crawls the URLs matched by link_extractor
    # and calls the method named in callback for each response.
    # Under the hood, link_extractor.extract_links(response) returns the matched links.
    rules = [
        Rule(link_extractor, callback='parse_page', follow=True),
    ]

    def parse_page(self, response):
        """Callback method that parses each response."""
        job_list = response.xpath('//div[@class="job-list"]//li')
        for job in job_list:
            position = job.xpath('.//div[@class="info-primary"]//h3[@class="name"]/a/text()')[0].extract()
            salary = job.xpath('.//div[@class="info-primary"]//h3[@class="name"]//span/text()')[0].extract()
            company = job.xpath('.//div[@class="company-text"]//a/text()')[0].extract()
            location = job.xpath('.//div[@class="info-primary"]/p/text()[1]')[0].extract()
            year = job.xpath('.//div[@class="info-primary"]/p/text()[2]')[0].extract()
            education = job.xpath('.//div[@class="info-primary"]/p/text()[3]')[0].extract()

            item = BosszhipinItem()
            item['position'] = position
            item['salary'] = salary
            item['company'] = company
            item['location'] = location
            item['year'] = year
            item['education'] = education
            yield item
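Note that these XPath expressions are tied to the zhipin.com markup at the time the original post was written; if the site's HTML has changed since, the selectors will need to be updated accordingly.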
pipelines.py
import json

class BosszhipinPipeline(object):
    """Item pipeline for the BOSS Zhipin Python job spider."""
    def __init__(self):
        self.f = open('data/bosszhipin.json', mode='wb')
        self.f.write(b'[')
        self.first_item = True

    def process_item(self, item, spider):
        data = json.dumps(dict(item), ensure_ascii=False, indent=4)
        if not self.first_item:
            # Separate items with commas so the file stays valid JSON
            # (no trailing comma before the closing bracket).
            self.f.write(b',')
        self.first_item = False
        self.f.write(data.encode('utf-8'))
        return item

    def close_spider(self, spider):
        self.f.write(b']')
        self.f.close()
settings.py
ITEM_PIPELINES = {
    'myscrapy.pipelines.BosszhipinPipeline': 1,
}
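With the pipeline registered, the spider can be run from the project root with the standard command scrapy crawl bosszhipin (the name attribute defined in the spider).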
Results
Summary
That covers everything in this article; hopefully it is of some reference value for your study or work. Thanks for supporting 服务器之家. For more related content, see the link below.
Original article: https://blog.csdn.net/topleeyap/article/details/78907149