Python爬虫爬取腾讯招聘信息 (静态爬虫)

时间:2022-01-02 21:49:00

环境:

Windows 7, Python 3.4

 

代码:(亲测可正常执行)

 1 import requests
 2 from bs4 import BeautifulSoup
 3 from math import ceil
 4 
 5 header = {
 6     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
 7 
 8 
 9 # 获取岗位页数
def getJobPage(url):
    """Return the number of result pages for a job search.

    Downloads ``url``, reads the total job count from the
    ``<span class="lightblue total">`` element and divides it by 10,
    rounding up (the listing is assumed to show 10 jobs per page).

    :param url: URL of the search-results page.
    :return: Number of pages as an int.
    :raises IndexError: if the page contains no total-count element.
    :raises ValueError: if the element's text is not an integer.
    :raises requests.exceptions.RequestException: on network failure or
        when the server does not answer within the timeout.
    """
    # Without a timeout requests waits indefinitely, so one stalled
    # connection would hang the whole crawler.
    ret = requests.get(url, headers=header, timeout=10)
    ret.encoding = "utf-8"  # force UTF-8 so the text is not garbled
    soup = BeautifulSoup(ret.text, 'html.parser')
    # The total is rendered as <span class="lightblue total">512</span>.
    totalJob = soup.select('span[class="lightblue total"]')[0].text
    return ceil(int(totalJob) / 10)
19 
20 
def getJobOrder(url):
    """Fetch a job detail page and return its two description sections.

    :param url: URL of the job detail page.
    :return: Tuple ``(jobRequests, jobOrder)`` holding the text of the
        first and second ``<ul class="squareli">`` elements; the first is
        treated as the job responsibilities and the second as the job
        requirements.
    :raises IndexError: if the page has fewer than two such lists.
    :raises requests.exceptions.RequestException: on network failure or
        when the server does not answer within the timeout.
    """
    # Without a timeout requests waits indefinitely on a stalled connection.
    ret = requests.get(url, headers=header, timeout=10)
    ret.encoding = "utf-8"  # force UTF-8 so the text is not garbled
    soup = BeautifulSoup(ret.text, 'html.parser')
    # Run the selector once and index the result instead of re-querying
    # the whole document for each section.
    sections = soup.select('ul[class="squareli"]')
    jobRequests = sections[0].text  # job responsibilities
    jobOrder = sections[1].text  # job requirements
    return jobRequests, jobOrder
31 
32 
33 # 获取岗位信息
def getJobInfo(url):
    """Scrape one search-results page and append its jobs to ``tencent_job.txt``.

    Every ``<tr>`` with class ``even`` or ``odd`` is treated as one job.
    For each job the detail page is fetched through ``getJobOrder`` and a
    single space-separated line is appended to the output file in this
    order: name, detail URL, location, head count, publish date,
    responsibilities, requirements.

    :param url: URL of the search-results page to scrape.
    :return: None.
    :raises IndexError: if a row lacks one of the expected cells or links.
    :raises requests.exceptions.RequestException: on network failure or
        when the server does not answer within the timeout.
    """
    # Without a timeout requests waits indefinitely on a stalled connection.
    ret = requests.get(url, headers=header, timeout=10)
    ret.encoding = "utf-8"  # force UTF-8 so the text is not garbled
    soup = BeautifulSoup(ret.text, 'html.parser')
    jobList = soup.find_all('tr', class_=['even', 'odd'])

    # The context manager guarantees the file is flushed and closed even
    # when a detail-page request fails halfway through the page.
    # errors='ignore' silently drops any character that cannot be encoded.
    with open("tencent_job.txt", "a", encoding='gb18030', errors='ignore') as myfile:
        for job in jobList:
            link = job.select('td:nth-of-type(1) > a')[0]
            # Detail-page URL
            jobUrl = "https://hr.tencent.com/" + link['href']
            # Job title
            jobName = link.text
            # Head count
            jobPeople = job.select('td:nth-of-type(3)')[0].text
            # Location
            jobAddre = job.select('td:nth-of-type(4)')[0].text
            # Publish date
            jobTime = job.select('td:nth-of-type(5)')[0].text
            # One request per job yields both sections; calling
            # getJobOrder twice would download the detail page twice.
            jobRequests, jobOrder = getJobOrder(jobUrl)

            fields = (jobName, jobUrl, jobAddre, jobPeople, jobTime, jobRequests, jobOrder)
            myfile.write(" ".join(fields) + "\n")
61 
62 
if __name__ == '__main__':
    # Script entry point: find out how many result pages the "python"
    # keyword search has, then scrape them one after another.
    mainurl = 'https://hr.tencent.com/position.php?keywords=python'
    jobPage = getJobPage(mainurl)
    print(jobPage)
    # Walk the result offsets directly (0, 10, 20, ...) and number the
    # pages from 1 for the progress output.
    for pageNumber, offset in enumerate(range(0, jobPage * 10, 10), start=1):
        pageUrl = 'https://hr.tencent.com/position.php?keywords=python&start={}#a'.format(offset)
        print(str(pageNumber))
        getJobInfo(pageUrl)