import requests
from retrying import retry
from lxml import etree
import json
class DaCheng(object):
    """Scrape the professionals listing of www.dachengnet.com.

    Each listing page is fetched, the rows of its table are turned into
    dictionaries with the keys ``Name``, ``Email``, ``Position`` and
    ``Location``, and the dictionaries are appended to ``DaCheng.json``.
    """

    def __init__(self, max_page=77):
        """Set up the URL template, request headers and page range.

        Args:
            max_page: Number of the last listing page to request (pages are
                numbered from 1). Defaults to 77, the range that was
                previously hard-coded.
        """
        self.temp_url = "http://www.dachengnet.com/cn/professionals?currentPageNo={}&"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36"}
        self.max_page = max_page

    def get_url_list(self):
        """Return the listing URLs for pages 1 through ``max_page``."""
        return [self.temp_url.format(page_no)
                for page_no in range(1, self.max_page + 1)]

    @retry(stop_max_attempt_number=3)
    def _parse_url(self, url):
        """Fetch ``url`` and return the parsed HTML tree.

        Any exception raised here triggers a retry; after three failed
        attempts the last exception propagates to the caller.

        Raises:
            requests.HTTPError: If the response status is not 200.
            requests.RequestException: On connection errors or timeouts.
        """
        r = requests.get(url, headers=self.headers, timeout=3)
        # An explicit check instead of ``assert``: assertions are removed
        # when Python runs with -O, which would let error pages through.
        if r.status_code != 200:
            raise requests.HTTPError(
                "unexpected status {} for {}".format(r.status_code, url),
                response=r,
            )
        return etree.HTML(r.content)

    def parse_url(self, url):
        """Return the parsed HTML tree for ``url``, or ``None`` on failure.

        Fetching is best-effort: every ordinary error is converted to
        ``None`` so that one bad page does not stop the crawl.
        """
        try:
            return self._parse_url(url)
        except Exception:
            # Deliberately not a bare ``except`` so that KeyboardInterrupt
            # and SystemExit still terminate the program.
            return None

    @staticmethod
    def _first_text(node, path, clean=False):
        """Return the first result of ``node.xpath(path)``, or ``None``.

        When ``clean`` is true the value is stripped and embedded newlines
        and tabs are removed.
        """
        matches = node.xpath(path)
        if not matches:
            return None
        value = matches[0]
        if clean:
            value = value.strip().replace('\n', '').replace('\t', '')
        return value

    def get_content_list(self, html):
        """Extract one dictionary per table row from a parsed page.

        Args:
            html: Parsed HTML tree as returned by ``parse_url``; ``None``
                (a failed fetch) yields an empty list.

        Returns:
            A list of dictionaries with the keys ``Name``, ``Email``,
            ``Position`` and ``Location``. A value is ``None`` when the
            corresponding cell has no text.
        """
        if html is None:
            return []
        content_list = []
        for tr in html.xpath("//tbody/tr"):
            content_list.append({
                'Name': self._first_text(tr, './td[1]/a/text()'),
                'Email': self._first_text(tr, './td[2]/text()'),
                'Position': self._first_text(tr, './td[3]/text()', clean=True),
                'Location': self._first_text(tr, './td[4]/text()', clean=True),
            })
        return content_list

    def save_content_list(self, content_list):
        """Append every item of ``content_list`` to ``DaCheng.json``.

        Each item is written as an indented JSON object followed by a
        comma and a newline, so the file as a whole is a comma-separated
        sequence of objects rather than a single JSON document.
        """
        # ensure_ascii=False emits raw non-ASCII characters, so the file
        # encoding is pinned instead of depending on the platform locale.
        with open('DaCheng.json', 'a', encoding='utf-8') as f:
            for content in content_list:
                json.dump(content, f, ensure_ascii=False, indent=2)
                f.write(',\n')
        print('保存成功')

    def run(self):
        """Crawl every listing page and save the extracted rows."""
        # 1. Build the list of page URLs.
        url_list = self.get_url_list()
        # 2. Request each page in turn.
        for url in url_list:
            html = self.parse_url(url)
            if html is None:
                # The fetch failed even after retries; skip this page
                # instead of crashing on ``None.xpath``.
                print('Failed to fetch {}; skipping'.format(url))
                continue
            # 3. Extract the data.
            content_list = self.get_content_list(html)
            # 4. Save it.
            self.save_content_list(content_list)
if __name__ == '__main__':
    # Script entry point: build the scraper and crawl all listing pages.
    scraper = DaCheng()
    scraper.run()