如何获取一个网站的相关信息?本文以赶集网的招聘信息为例,介绍利用 Python 抓取赶集网招聘信息的关键代码,供大家参考,具体内容如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
|
import re
import urllib
import urllib.request
def begin(url, timeout=30):
    """Download *url* while presenting a Chrome user agent and return its text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Browser identity to present to the server (a Chrome user-agent string).
    headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36')
    opener = urllib.request.build_opener()
    # Attach the spoofed browser identity to every request made by this opener.
    opener.addheaders = [headers]
    # The context manager closes the response even if read() fails, so the
    # underlying connection is not leaked.
    with opener.open(url, timeout=timeout) as response:
        data = response.read()
    return data.decode('utf-8')
def get_cityinfo(data):
    """Build a mapping of city name to city URL from the index page HTML.

    Only the first ``dl>...</dl>`` section found in *data* is scanned. Every
    ``<a>`` element inside it contributes one entry: the anchor text becomes
    the key and its ``href`` value becomes the URL.

    Args:
        data: HTML text of the index page.

    Returns:
        A dict of ``{city_name: city_url}``. It is empty when no ``<dl>``
        section is present; anchors missing text or an ``href`` are skipped.
    """
    sections = re.findall(r'dl>(.*?)</dl>', data, re.S)
    if not sections:
        # Nothing that looks like the city list: report "no cities" instead
        # of failing with IndexError.
        return {}

    city_dict = {}
    for anchor in re.findall(r'(<a.*?</a>)', sections[0], re.S):
        names = re.findall(r'>(.*?)</', anchor, re.S)
        urls = re.findall(r'href="(.*?)"', anchor, re.S)
        if names and urls:
            city_dict[names[0]] = urls[0]
    return city_dict
def a_info(data):
    """Collect every ``<a>`` element in ``data[0]`` into a dict.

    For each anchor the key is the anchor text and the value is the first
    double-quoted string inside the tag (whatever attribute comes first).

    Args:
        data: Indexable container whose first item is the HTML text to scan.
            NOTE(review): presumably a list returned by ``re.findall`` --
            confirm against callers.

    Returns:
        A dict of ``{anchor_text: first_quoted_value}``. It is empty when
        *data* is empty; anchors without text or a quoted value are skipped.
    """
    if not data:
        return {}

    # A distinct local name avoids shadowing the function itself.
    anchors = re.findall(r'(<a.*?</a>)', data[0], re.S)
    a_dict = {}
    for anchor in anchors:
        texts = re.findall(r'>(.*?)</', anchor, re.S)
        quoted = re.findall(r'"(.*?)"', anchor, re.S)
        if texts and quoted:
            a_dict[texts[0]] = quoted[0]
    return a_dict
def get_cityinfoclass():
    """Return the URL path segment of the category to browse for a city.

    Only the recruitment category is supported for now; the original author
    noted that further categories may be added later.

    Returns:
        The string ``'zhaopin/'``.
    """
    info = 'zhaopin/'
    return info
def getzhaopin(city_info, infoclass, city='成都'):
    """Fetch the category page for one city.

    Args:
        city_info: Mapping of city name to that city's base URL.
        infoclass: Category path appended to the city URL.
        city: Name of the city to look up in *city_info*. Defaults to
            Chengdu, the only city the original script handled.

    Returns:
        A ``(city_url, page_html)`` tuple: the city's base URL and the text
        of the downloaded category page.

    Raises:
        KeyError: If *city* is not present in *city_info*.
    """
    city_url = city_info[city]
    page_url = city_url + infoclass
    page_html = begin(page_url)
    return city_url, page_html
def get_zhaopin_info(city_url, cdzp_info):
    """Extract the job categories and their URLs from a recruitment page.

    The first element carrying ``class="f-all-news"`` is located, each
    ``<dd>`` inside it is scanned, and every ``<a>`` found becomes one entry.

    Args:
        city_url: Base URL of the city; each anchor's ``href`` (with its
            leading ``/`` removed by the pattern) is appended to it.
        cdzp_info: HTML text of the city's recruitment page.

    Returns:
        A dict of ``{category_name: absolute_url}`` with surrounding
        whitespace stripped from the names. It is empty when the
        ``f-all-news`` section is missing; anchors without text or a
        root-relative ``href`` are skipped.
    """
    sections = re.findall(r'class="f-all-news"(.*?)</div>', cdzp_info, re.S)
    if not sections:
        # Page layout not recognised: report "no categories" rather than
        # failing with IndexError.
        return {}

    a_dict = {}
    for group in re.findall(r'<dd>(.*?)</dd>', sections[0], re.S):
        for anchor in re.findall(r'(<a.*?</a>)', group, re.S):
            names = re.findall(r'>(.*?)</', anchor, re.S)
            paths = re.findall(r'href="/(.*?)"', anchor, re.S)
            if names and paths:
                a_dict[names[0].strip()] = city_url + paths[0]
    return a_dict
def get_city_zpinfo_detail(url, job_class='软件工程师'):
    """Download the listing page of one job category and print its first entry.

    Args:
        url: Despite its name, this is indexed as a mapping of job-category
            name to listing URL (the script's entry point passes the dict
            returned by ``get_zhaopin_info``).
        job_class: Category to look up in *url*. Defaults to the software
            engineer category, the only one the original script handled.

    Raises:
        KeyError: If *job_class* is not present in *url*.
    """
    # Read the argument rather than a module-level global, so the function
    # also works when it is imported and called from elsewhere.
    job_list_url = url[job_class]
    job_list_html = begin(job_list_url)
    get_detail_info(job_list_html)
def get_detail_info(list_info):
    """Print the first job entry found in a job listing page.

    Args:
        list_info: HTML text of a job listing page.

    The text printed is whatever follows
    ``<dl class="list-noimg job-list clearfix"`` up to the next ``</dl``.
    Nothing is printed when the page contains no such element.
    """
    job_info = re.findall(r'<dl class="list-noimg job-list clearfix"(.*?)</dl', list_info, re.S)
    if not job_info:
        # No job entries on this page; avoid an IndexError on job_info[0].
        return
    print(job_info[0])
if __name__ == '__main__':
    # Script entry point: walk from the site index to the first job entry
    # printed by get_city_zpinfo_detail.
    url = 'http://www.ganji.com/index.htm'
    data = begin(url)
    # All cities and their URLs.
    city_info = get_cityinfo(data)
    # Category path to browse within a city.
    infoclass = get_cityinfoclass()
    # getzhaopin returns (city base URL, recruitment page HTML).
    cdzp_url, xiaoshou = getzhaopin(city_info, infoclass)
    # Job categories offered on the recruitment page and their URLs.
    zp_class_info = get_zhaopin_info(cdzp_url, xiaoshou)
    get_city_zpinfo_detail(zp_class_info)
|
以上就是本文的全部内容,希望对大家的学习有所帮助。