Installing Scrapy is not covered here.
In the console, run scrapy startproject tencent to create a crawler project named tencent.
Then cd tencent.
Open the tencent project in PyCharm.
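For orientation, the layout that scrapy startproject generates looks roughly like this (a sketch; the exact files vary slightly by Scrapy version):

tencent/
    scrapy.cfg          # deploy configuration
    tencent/            # the project's Python module
        __init__.py
        items.py        # item definitions (edited below)
        pipelines.py    # item pipelines (edited below)
        settings.py     # project settings (edited below)
        spiders/        # spiders live here
            __init__.py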
Start by building the item file:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class TencentItem(scrapy.Item):
    # Position name
    positionname = scrapy.Field()
    # Detail link
    positionLink = scrapy.Field()
    # Position category
    positionType = scrapy.Field()
    # Number of openings
    peopleNum = scrapy.Field()
    # Work location
    workLocation = scrapy.Field()
    # Publish time
    publishTime = scrapy.Field()
Next, create a new file tencentPostition.py in the spiders folder. The code is below; the comments explain each step.
# -*- coding: utf-8 -*-
import scrapy

from tencent.items import TencentItem


class TencentpostitionSpider(scrapy.Spider):
    # Spider name
    name = 'tencent'
    # Allowed domains
    allowed_domains = ['tencent.com']
    # Base URL
    url = 'http://hr.tencent.com/position.php?&start='
    # Page offset
    offset = 0
    # Default starting URL
    start_urls = [url + str(offset)]

    def parse(self, response):
        # XPath rule matching both row styles of the listing table
        for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            item = TencentItem()
            # Position name
            item["positionname"] = each.xpath("./td[1]/a/text()").extract()[0]
            # Detail link
            item["positionLink"] = each.xpath("./td[1]/a/@href").extract()[0]
            # Position category (this cell is sometimes empty)
            try:
                item["positionType"] = each.xpath("./td[2]/text()").extract()[0]
            except IndexError:
                item["positionType"] = '空'  # placeholder meaning "empty"
            # Number of openings
            item["peopleNum"] = each.xpath("./td[3]/text()").extract()[0]
            # Work location
            item["workLocation"] = each.xpath("./td[4]/text()").extract()[0]
            # Publish time
            item["publishTime"] = each.xpath("./td[5]/text()").extract()[0]
            # Hand the item over to the pipeline
            yield item
        # Advance the offset to the next page
        if self.offset < 2620:
            self.offset += 10
            # Hand the new request back to the scheduler
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
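Before running the whole spider, the XPath rules can be sanity-checked interactively with Scrapy's shell. A quick session might look like this (the URL is simply the spider's first page; the row count per page is an assumption based on the step of 10 used above):

scrapy shell "http://hr.tencent.com/position.php?&start=0"
>>> rows = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
>>> len(rows)    # expect roughly 10 rows, one per listed position
>>> rows[0].xpath("./td[1]/a/text()").extract()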
Next, configure the pipeline file pipelines.py as follows:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json


class TencentPipeline(object):

    def __init__(self):
        # Open the output file when the pipeline is created
        self.fileName = open("tencent.json", "wb")

    def process_item(self, item, spider):
        # Convert the item to a dict, then serialize it as JSON
        text = json.dumps(dict(item), ensure_ascii=False) + "\n"
        # Write to the file, encoded as UTF-8
        self.fileName.write(text.encode("utf-8"))
        # Return the item for any later pipelines
        return item

    def close_spider(self, spider):
        # Close the file when the spider shuts down
        self.fileName.close()
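A small design note: opening the file in __init__ works, but Scrapy also calls an open_spider hook on pipelines, which pairs naturally with close_spider. A minimal sketch of the same pipeline using that hook instead:

# -*- coding: utf-8 -*-
import json


class TencentPipeline(object):

    def open_spider(self, spider):
        # Called once when the spider starts
        self.fileName = open("tencent.json", "wb")

    def process_item(self, item, spider):
        text = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.fileName.write(text.encode("utf-8"))
        return item

    def close_spider(self, spider):
        # Called once when the spider finishes
        self.fileName.close()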
Next, configure the settings.py file.
First, tell the crawler not to obey the ROBOTS rules:
ROBOTSTXT_OBEY = False
# Download delay
DOWNLOAD_DELAY = 3
# Set the default request headers
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}
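If only the user agent needs changing, Scrapy also has a dedicated USER_AGENT setting, so the headers dict is not strictly required for that part (a sketch, equivalent in effect for the User-Agent only):

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'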
# Which pipeline processes the items: package.module.ClassName
# (300 is the pipeline's priority; values range 0-1000, lower runs first)
ITEM_PIPELINES = {
    'tencent.pipelines.TencentPipeline': 300,
}
Finally, enter the following in the console to start the crawl:

scrapy crawl tencent
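As an aside: since the pipeline here only dumps items to a file with one JSON object per line, Scrapy's built-in feed exports can achieve the same result with no custom pipeline code at all; this would replace TencentPipeline rather than complement it:

scrapy crawl tencent -o tencent.jl    # .jl = JSON lines, one item per line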
Source code:
https://github.com/ingxx/scrapy_to_tencent