Scraping job postings from 51job and Zhilian

Time: 2021-11-05 18:49:17

Spider code:

# -*- coding: utf-8 -*-
import scrapy
from ..items import JOBspiderItem

class A51jobSpider(scrapy.Spider):
    name = '51job'
    allowed_domains = ['51job.com']
    start_urls = [
        'http://search.51job.com/list/020000%252C080200%252C180200%252C040000,000000,0000,00,9,99,python,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        'http://search.51job.com/list/020000%252C080200%252C180200%252C040000,000000,0000,00,9,99,java,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        'http://search.51job.com/list/020000%252C080200%252C180200%252C040000,000000,0000,00,9,99,php,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
    ]

    def parse(self, response):
        '''
        Entry point: re-issue the start URL (dont_filter=True allows the
        duplicate request) so the response is handled by parse_job_info.
        :param response:
        :return:
        '''
        yield scrapy.Request(
            url=response.url,
            callback=self.parse_job_info,
            meta={},
            dont_filter=True,
        )

    def parse_next_page(self,response):

        '''
        Find the link to the next result page and request it.
        :param response:
        :return:
        '''
        # .extract_first('') returns the first match as a plain string
        # (or the given default when nothing matches)
        next_page=response.xpath('//li[@class="bk"][2]/a/@href').extract_first('')
        if next_page:
            yield scrapy.Request(
                # urljoin handles relative hrefs as well as absolute ones
                url=response.urljoin(next_page),
                callback=self.parse_job_info,
                meta={},
                dont_filter=True,
            )

            '''
            parse_job_info and parse_next_page keep requesting each other,
            so the crawl walks through the result pages much like recursion
            (a function that calls itself).
            '''
    def parse_job_info(self,response):
        '''
        Parse every job listing on the current result page, then hand the
        response back so the next page can be located.
        :param response:
        :return:
        '''
        job_div_list=response.xpath('//div[@id="resultList"]/div[@class="el"]')
        for job_div in job_div_list:
            job_name=job_div.xpath('p/span/a/@title').extract_first('无工作').strip()
            job_company_name=job_div.xpath('span[@class="t2"]/a/@title').extract_first('无公司信息').strip()
            job_place=job_div.xpath('span[@class="t3"]/text()').extract_first('无工作地点').strip()
            job_salary=job_div.xpath('span[@class="t4"]/text()').extract_first('无薪资信息').strip()
            job_time=job_div.xpath('span[@class="t5"]/text()').extract_first('无时间信息').strip()
            job_type='51job' if '51job.com' in response.url else '其他'
            print(job_type,job_name,job_company_name, job_place,job_salary,job_time)
            '''
            Data cleaning: strip() removes whitespace and other stray
            characters from both ends of each field
            (see the short example after this spider).
            '''
            item=JOBspiderItem()
            item['job_name'] = job_name
            item['job_company_name'] = job_company_name
            item['job_place'] = job_place
            item['job_salary'] = job_salary
            item['job_time'] = job_time
            item['job_type'] = job_type
            item['fan_kui_lv'] = '没有反馈率'
            yield item
        # Re-request the current page so parse_next_page can pick up the
        # "next page" link and continue the crawl.
        yield scrapy.Request(
            url=response.url,
            callback=self.parse_next_page,
            meta={},
            dont_filter=True,
        )
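The data-cleaning step mentioned above is easiest to see in isolation: the same XPath → extract_first(default) → strip() chain can be run against a small HTML fragment with Scrapy's Selector. The fragment and field values below are made up purely for illustration:

# -*- coding: utf-8 -*-
from scrapy import Selector

# Hypothetical fragment shaped like one row of 51job's result list
html = '''
<div id="resultList">
  <div class="el">
    <p class="t1"><span><a title="  Python 开发工程师  "></a></span></p>
    <span class="t4">  1-1.5万/月  </span>
  </div>
</div>
'''

sel = Selector(text=html)
row = sel.xpath('//div[@id="resultList"]/div[@class="el"]')[0]

# extract_first(default) turns the SelectorList into a plain string,
# and strip() removes the whitespace at both ends
job_name = row.xpath('p/span/a/@title').extract_first('无工作').strip()
job_salary = row.xpath('span[@class="t4"]/text()').extract_first('无薪资信息').strip()

print(job_name)    # Python 开发工程师
print(job_salary)  # 1-1.5万/月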



Pipeline code (pipelines.py):

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

# A Pipeline receives the item objects yielded by the spider.
class JobspiderPipeline(object):
    def process_item(self, item, spider):
        # default pipeline: pass the item through unchanged
        return item


class ToCsvPipeline(object):
    def process_item(self, item, spider):
        # gb18030 keeps the Chinese text readable when the CSV is opened in Excel
        with open("job.csv", "a", encoding="gb18030") as f:
            job_name = item['job_name']
            fan_kui_lv = item['fan_kui_lv']
            job_company_name = item['job_company_name']
            job_salary = item['job_salary']
            job_place = item['job_place']
            job_time = item['job_time']
            job_type = item['job_type']
            job_info = [job_name, fan_kui_lv, job_company_name, job_salary, job_place, job_time, job_type]
            # join the fields with commas and end the row with a newline
            f.write(",".join(job_info) + "\n")
        # pass the item on to the next pipeline
        return item
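As the generated comment at the top of this file says, pipelines only run once they are registered in ITEM_PIPELINES. A minimal settings.py sketch, assuming the project package is named jobspider (adjust the module path and priorities to your own project):

# settings.py (sketch; the package name jobspider is an assumption)
ITEM_PIPELINES = {
    'jobspider.pipelines.JobspiderPipeline': 300,
    'jobspider.pipelines.ToCsvPipeline': 400,
}

Lower numbers run first, so JobspiderPipeline sees each item before ToCsvPipeline writes it to job.csv.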

Item definitions (items.py):

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy




class JOBspiderItem(scrapy.Item):
    job_name=scrapy.Field()
    job_company_name=scrapy.Field()
    job_place=scrapy.Field()
    job_salary=scrapy.Field()
    job_time=scrapy.Field()
    job_type=scrapy.Field()
    fan_kui_lv = scrapy.Field()
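
To start the crawl from a script instead of the command line, Scrapy's cmdline helper can be used; '51job' is the name attribute of the spider class above. A small sketch:

# run.py - programmatic equivalent of `scrapy crawl 51job`
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', '51job'])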