Python crawler: scraping Liepin (猎聘网) job details with the requests library and XPath

Date: 2024-10-06 07:42:26
The whole spider fits in one script: it walks the Liepin search-result pages, collects every job's detail-page link with XPath, parses each detail page into a dict (title, company, salary, location, education, experience, language, age, description), and rewrites everything gathered so far to liepin_spider.csv after each record. Every request goes out with a random proxy from a local pool and a random User-Agent.

import requests
from lxml import etree
import time
import random
import csv
from random import choice
import http.client

# Raise http.client's per-line limit so the very long header lines used below
# don't trigger a LineTooLong error while parsing responses.
http.client._MAXLINE = 524288

base_url = ''          # host prefix for relative detail links (left blank in the original post)
positions_list = []    # every parsed job is appended here
i = 0                  # total records scraped
z = 0                  # current result page


def get_proxies(ip_pool_name='https_ips_pool.csv'):
    # Pick a random proxy from the local pool file.
    with open(ip_pool_name, 'r') as f:
        datas = f.readlines()
    ran_num = random.choice(datas)
    ip = ran_num.strip().split(',')
    proxies = {ip[0]: ip[1] + ':' + ip[2]}
    return proxies


def get_headers():
    # Build request headers with a User-Agent picked at random from user_agent.txt.
    with open('user_agent.txt', 'r') as file:
        user_agent_list = file.readlines()
    user_agent = str(choice(user_agent_list)).replace('\n', '')
    # Fall back to a fixed Chrome UA if the line read from the file looks too short to be valid.
    user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36" if len(user_agent) < 20 else user_agent
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        # Session cookie copied from a logged-in browser session; replace it with your own.
        "Cookie": "__uuid=1576743949178.08; need_bind_tel=false; new_user=false; c_flag=a99aaaa31f739e3b04d3fa768574cabd; gr_user_id=bdb451db-1bc4-4899-a3b3-410b19c06161; bad1b2d9162fab1f80dde1897f7a2972_gr_last_sent_cs1=3872aec89444b8931a667e00ad0d9493; grwng_uid=6c0a08dc-d2e4-407b-b227-89fe9281943e; fe_work_exp_add=true; gr_session_id_bad1b2d9162fab1f80dde1897f7a2972=39311f19-2c25-419e-9a69-64940ae15c78; gr_cs1_39311f19-2c25-419e-9a69-64940ae15c78=UniqueKey%3A3872aec89444b8931a667e00ad0d9493; AGL_USER_ID=15f1f78f-e535-4ccc-8da0-ec0728eb9fb7; __s_bid=5ec9f0f87b044308fb05861763266522a1d4; imClientId=deef7ae9f2746887611c3686cabc4d86; imId=deef7ae9f2746887f5aceb762480da5b; imClientId_0=deef7ae9f2746887611c3686cabc4d86; imId_0=deef7ae9f2746887f5aceb762480da5b; abtest=0; fe_se=-1588144304556; Hm_lvt_a2647413544f5a04f00da7eee0d5e200=1587405434,1587440994,1587693344,1588144305; UniqueKey=3872aec89444b8931a667e00ad0d9493; lt_auth=vukLOidWzg%2Bv4iTRiTBf7fpI3Yr5VmTL%2FX0Mh0gJh4W6W%2FWw4PzqRQiDrbIPxAMhxxMmI8ULNLj2NOz3wHVI70IQwGmulICyv%2F2k03sEUeVkI8W2vezHg%2FXSQp4ilEAC8nJbpEIL%2BQ%3D%3D; access_system=C; user_roles=0; user_photo=; user_name=%E5%B8%B8%E4%BF%8A%E6%9D%B0; bad1b2d9162fab1f80dde1897f7a2972_gr_session_id=01e3d74e-99ae-4fbf-af09-101908fa9115; bad1b2d9162fab1f80dde1897f7a2972_gr_last_sent_sid_with_cs1=01e3d74e-99ae-4fbf-af09-101908fa9115; bad1b2d9162fab1f80dde1897f7a2972_gr_session_id_01e3d74e-99ae-4fbf-af09-101908fa9115=true; __tlog=1588144304624.83%7C00000000%7CR000000058%7Cs_00_t00%7Cs_00_t00; JSESSIONID=A6BB05126EF05E24DC5F8E80F0D3FFFE; __uv_seq=23; __session_seq=23; bad1b2d9162fab1f80dde1897f7a2972_gr_cs1=3872aec89444b8931a667e00ad0d9493; Hm_lpvt_a2647413544f5a04f00da7eee0d5e200=1588151024; fe_im_socketSequence_0=6_6_6",
        "Host": "",
        "Referer": "/zhaopin/?industries=&subIndustry=&dqs=&salary=&jobKind=&pubTime=&compkind=&compscale=&industryType=&searchType=1&clean_condition=&isAnalysis=&init=1&sortFlag=15&flushckid=0&fromSearchBtn=1&headckid=b8fa977dd1f04136&d_headId=83d82171062d54a659301a2193fa9e67&d_ckId=83d82171062d54a659301a2193fa9e67&d_sfrom=search_fp_bar&d_curPage=0&d_pageSize=40&siTag=bbL6aoW_xGX8iD8Yj4vLYw%7EfA9rXquZc5IkJpXC-Ycixw&key=",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": user_agent
    }
    return headers
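# The two files read above are assumptions about your local setup rather than
# anything provided by the site. get_proxies() splits each line of
# https_ips_pool.csv on commas into (scheme, ip, port), so a line such as
#     https,123.57.66.10,8080          <- illustrative values only
# becomes {'https': '123.57.66.10:8080'}, and get_headers() expects
# user_agent.txt to hold one complete User-Agent string per line.
# Adjust both files to whatever format your own proxy pool and UA list use.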
def get_detail_url():
    global z
    # Crawl the first 100 search-result pages.
    for x in range(100):
        # Result-page URL for page x; the scheme and host appear to have been stripped
        # when the post was published, so prepend the site's domain before running.
        start_url = '/zhaopin/?isAnalysis=&dqs=&pubTime=&jobTitles=N000075%2CN000082%2CN000077&salary=&subIndustry=&industryType=&compscale=&key=&init=-1&searchType=1&headckid=5331cc6c28c3b7cf&compkind=&fromSearchBtn=2&sortFlag=15&ckid=5331cc6c28c3b7cf&degradeFlag=0&jobKind=&industries=&clean_condition=&siTag=1B2M2Y8AsgTpgAmY7PhCfg%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_prime&d_ckId=4a035a6b6d76db17ef8aca969d0a892a&d_curPage=2&d_pageSize=40&d_headId=4a035a6b6d76db17ef8aca969d0a892a&curPage={}'.format(x)
        print(start_url)
        z += 1
        print("Now on result page " + str(z) + "!")
        # Keep retrying with a fresh proxy until the page comes back with a 200.
        while True:
            proxies = get_proxies()
            try:
                response = requests.get(start_url, headers=get_headers(), proxies=proxies)
            except Exception as t:
                # Bad proxy or network error: log it and retry with another proxy,
                # the same pattern parse_url() uses below.
                print("exception:", t)
                continue
            if response.status_code == 200:
                response = response.text
                tree = etree.HTML(response)
                # Every job card on the result page links to its detail page.
                detil_urls = tree.xpath("//div[@class='job-info']//a[@target='_blank']/@href")
                parse_url(detil_urls)
                time.sleep(1)
                break


def parse_url(detil_urls):
    global z
    global i
    global base_url
    global positions_list
    j = 0
    for url in detil_urls:
        if url.startswith('http'):
            # Absolute link: this page template keeps the requirements in a 'job-qualifications' block.
            print(url)
            while True:
                proxies = get_proxies()
                try:
                    text = requests.get(url, headers=get_headers(), proxies=proxies)
                    if text.status_code == 200:
                        text = text.text
                        tree = etree.HTML(text)
                        title = tree.xpath("//div[@class='title-info']/h1/text()")[0].strip()
                        company = tree.xpath("//div[@class='title-info']/h3/a/text()")[0].strip()
                        salary = tree.xpath("//p[@class='job-item-title']/text()")[0].strip()
                        location = tree.xpath("//p[@class='basic-infor']/span/a/text()")
                        # Some pages put an empty text node first, so fall back to the second one.
                        if location[0].strip() == '':
                            location = location[1].strip()
                        else:
                            location = location[0].strip()
                        # times = ("//p[@class='basic-infor']/time/@title")[0].strip()
                        education = tree.xpath("//div[@class='job-qualifications']//text()")[1].strip()
                        experience = tree.xpath("//div[@class='job-qualifications']//text()")[3].strip()
                        language = tree.xpath("//div[@class='job-qualifications']//text()")[5].strip()
                        age = tree.xpath("//div[@class='job-qualifications']//text()")[7].strip()
                        contents = tree.xpath("//div[@class='content content-word']/text()")
                        contents = list(map(lambda content: content.strip(), contents))
                        contents = "".join(contents)
                        positions = {
                            "title": title,
                            "company": company,
                            "salary": salary,
                            "location": location,
                            # "times": times,
                            "education": education,
                            "experience": experience,
                            "language": language,
                            "age": age,
                            "contents": contents
                        }
                        positions_list.append(positions)
                        break
                except Exception as t:
                    print("exception:", t)
        else:
            # Relative link: prepend base_url; this page template uses a 'resume clearfix' block instead.
            url = base_url + url
            print(url)
            while True:
                proxies = get_proxies()
                try:
                    text = requests.get(url, headers=get_headers(), proxies=proxies)
                    if text.status_code == 200:
                        text = text.text
                        tree = etree.HTML(text)
                        title = tree.xpath("//div[@class='title-info ']/h1/text()")[0].strip()
                        company = tree.xpath("//div[@class='title-info ']/h3//text()")[0].strip()
                        salary = tree.xpath("//p[@class='job-main-title']/text()")[0].strip()
                        location = tree.xpath("//p[@class='basic-infor']/span//text()")[0].strip()
                        # times = ("//p[@class='basic-infor']/time/@title")[0].strip()
                        education = tree.xpath("//div[@class='resume clearfix']//text()")[1].strip()
                        experience = tree.xpath("//div[@class='resume clearfix']//text()")[3].strip()
                        language = tree.xpath("//div[@class='resume clearfix']//text()")[5].strip()
                        age = tree.xpath("//div[@class='resume clearfix']//text()")[7].strip()
                        contents = tree.xpath("//div[@class='content content-word']/text()")
                        contents = list(map(lambda content: content.strip(), contents))
                        contents = "".join(contents)
                        positions = {
                            "title": title,
                            "company": company,
                            "salary": salary,
                            "location": location,
                            # "times": times,
                            "education": education,
                            "experience": experience,
                            "language": language,
                            "age": age,
                            "contents": contents
                        }
                        positions_list.append(positions)
                        break
                except Exception as t:
                    print("exception:", t)
        j += 1
        print("Added record " + str(j) + " from page " + str(z) + "!")
        i += 1
        print("Total records so far: " + str(i))
        # Rewrite the CSV after every record so nothing is lost if the run dies.
        save_files(positions_list)
        time.sleep(random.uniform(1, 2))
def save_files(positions_list):
    headers = ['title', 'company', 'salary', 'location', 'education', 'experience', 'language', 'age', 'contents']
    with open("liepin_spider.csv", 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, headers)
        writer.writeheader()
        for positions in positions_list:
            writer.writerow(positions)


if __name__ == '__main__':
    get_detail_url()
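Once a run finishes (or is interrupted), everything collected so far sits in liepin_spider.csv, because save_files() rewrites the whole file after each record rather than appending. A minimal sanity check of the output, using only the standard-library csv module and the field names written by save_files() above:

import csv

with open("liepin_spider.csv", encoding="utf-8", newline="") as f:
    rows = list(csv.DictReader(f))

print(len(rows), "records scraped")
if rows:
    # Spot-check the first record's key fields.
    print(rows[0]["title"], rows[0]["company"], rows[0]["salary"])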