# Scrape one page of 51job job listings with BS4 (BeautifulSoup). Date: 2022-04-08 20:41:17
# -*- coding:utf-8 -*-
import sys
import requests
# 文件读写 open
# 用法和open类似,但是可以指定编码方式
# 强调:读取文件,建议不使用open,而是使用codecs.open
import codecs
from bs4 import BeautifulSoup
# Python 2 only: `reload` is a builtin and `sys.setdefaultencoding` is available
# solely on that interpreter line. The pair switches the implicit str/unicode
# codec to UTF-8. Python 3 has neither name (and already defaults to UTF-8),
# so the hack is skipped there instead of failing with a NameError.
if sys.version_info[0] < 3:
    reload(sys)  # makes sys.setdefaultencoding reachable again
    sys.setdefaultencoding("utf-8")

# url = 'http://search.51job.com/list/020000%252C00,000000,0000,00,9,99,python,2,1.html?lang=c&degreefrom=99&stype=1&workyear=99&cotype=99&jobterm=99&companysize=99&radius=-1&address=&lonlat=&postchannel=&list_type=&ord_field=&curr_page=&dibiaoid=0&landmark=&welfare='
# headers = {
# "User-Agent": 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0'
# }
# response = requests.get(url, headers=headers)
# content = response.content

# To speed up repeated runs, the page source was saved locally as 2.html
# (GBK-encoded) instead of being downloaded on every execution.
# The context manager closes the file handle once the text has been read.
with codecs.open("2.html", "r", encoding="gbk") as html_file:
    content = html_file.read()
bs = BeautifulSoup(content, "lxml")

# Locate the <div id="resultList"> element that the listing rows are read from.
big_div = bs.find("div", attrs={
    "id": "resultList",
})
if big_div is None:
    # find() yields None when nothing matches; stop with a clear message
    # instead of failing later with an AttributeError on None.
    sys.exit('No <div id="resultList"> found in 2.html')

# Every <div class="el"> inside the container is treated as one listing row.
div_list = big_div.find_all("div", attrs={
    "class": "el",
})
# Drop the first matching row (presumably the column-header row -- TODO confirm).
# Slicing, unlike `del div_list[0]`, is also safe when there are no rows at all.
div_list = div_list[1:]

for div in div_list:
    # Query each tag type once per row rather than once per extracted field.
    links = div.find_all("a")
    spans = div.find_all("span")
    if len(links) < 2 or len(spans) < 5:
        # The row lacks the anchors/spans indexed below; skip it rather than
        # abort the whole run with an IndexError.
        continue
    job_name = links[0].text.strip()
    company_name = links[1].text.strip()
    job_place = spans[2].text.strip()
    job_money = spans[3].text.strip()
    release_time = spans[4].text.strip()
    # A single pre-joined argument prints the same space-separated line
    # whether print is a statement (Python 2) or a function (Python 3).
    print(" ".join((job_name, company_name, job_place, job_money, release_time)))