Using a web crawler to scrape IT job postings from Liepin (liepin.com) and visualize the data (Python + MySQL, no crawler framework)

Date: 2024-10-06 07:51:54
"""
coding: utf-8
requires: Python 3.6
"""
import requests
import lxml
import re
import pymysql
from bs4 import BeautifulSoup
from multiprocessing import Pool
"""--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------"""
def getTableName(ID):
    """
    Some category identifiers contain characters that are not allowed in MySQL table
    names; this function returns a legal table name for such identifiers.
    """
    replaceDict={
        "":"NodeJS",
        ".NET":"NET",
        "C#":"CC",
        "C++":"CPP",
        "COCOS2D-X":"COCOS2DX"
    }
    if ID in replaceDict:
        return replaceDict[ID]
    else:
        return ID
"""--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------"""
def parseWage(wage):
    """
    Parse the wage string `wage`. If it is '面议' (negotiable) or anything else that
    does not match, return [0,0]; otherwise return the corresponding lower and upper
    wage (in units of 10k RMB).
    """
    parsedResult=re.findall('(.*?)-(.*?)万.*?',wage,re.S)
    if not parsedResult:
        return [0,0]
    else:
        return [parsedResult[0][0],parsedResult[0][1]]
"""--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------"""
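#A quick illustration of parseWage with hypothetical inputs (not taken from a real crawl):
#  parseWage('15-30万')  returns ['15', '30']   (lower and upper bound, in units of 10k RMB)
#  parseWage('面议')      returns [0, 0]          (no match, wage is negotiable)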
def table_exists(cursor,table_name):
    """
    Check whether a given table already exists in the database schema:
    return True if it exists, False otherwise.
    """
    sql = "show tables;"
    cursor.execute(sql)
    tables = [cursor.fetchall()]
    table_list = re.findall('(\'.*?\')',str(tables))
    table_list = [re.sub("'",'',each) for each in table_list]
    if table_name in table_list:
        return True
    else:
        return False
"""--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------"""
def isUrlValid(url):
    """
    While the crawler was running, some job-detail entry URLs raised a MissingSchema
    exception when requesting the response. It turned out that some URLs look like
    .../job/... (usually postings published by Liepin itself) while others look like
    .../a/... (usually postings published by agents); the latter cannot be parsed and
    stopped the crawler at that point.
    This function filters out agent-published URLs: for an agent-published posting the
    entry is skipped and the function returns False, otherwise it returns True.
    """
    isValid=re.findall(r'.*?www\.liepin\.com/job/.*?$',url,re.S)
    if isValid:
        return True
    else:
        return False
"""--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------"""
def getPageHtml(url,headers=None):
    """
    Return the HTML of the server's response page, or None on failure.
    """
    if not headers:
        headers={
            "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
        }
    try:
        response=requests.get(url,headers=headers)
        if response.status_code==200:
            return response.text
        else:
            return None
    except requests.RequestException as e:
        #debug
        print('Exception occurred in function getPageHtml():',e)
        return None
"""--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------"""
def getEntry(html):
    """
    Parse the HTML. This function is a generator: each iteration yields a dict `entry`
    made up of a sub-category entry URL and its description.
    """
    if not html:
        #if html is None the sub-category entry URLs cannot be parsed from it
        #debug
        print('html is None in function getEntry()')
        return None
    soup=BeautifulSoup(html,'lxml')
    for items in soup.find_all(name='li'):
        for item in items.find_all(name='dd'):
            for usefulURL in item.find_all(name='a',attrs={"target":"_blank","rel":"nofollow"}):
                yield {
                    "URL":'https://www.liepin.com'+usefulURL.attrs['href'],
                    "URL_Description":usefulURL.text
                }
"""--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------"""
def getCountryEntry(entry):
    """
    `entry` is a dict of a sub-category URL and its description URL_Description.
    This function fetches the sub-category page and returns a dict of the nationwide
    listing URL CountryURL and CountryURLDescription (which is simply URL_Description).
    """
    if not entry:
        #debug
        print('ERROR in function getCountryEntry():entry is None')
        return None
    headers={
        "Host":"www.liepin.com",
        "Referer":"https://www.liepin.com/it/",
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
    }
    countryHtml=getPageHtml(entry['URL'],headers=headers)
    soup=BeautifulSoup(countryHtml,'lxml')
    citiesInfo=soup.find(name='dd',attrs={"data-param":"city"})
    if not citiesInfo:
        #debug
        print('ERROR in function getCountryEntry():citiesInfo is None.')
        return None
    db=pymysql.connect(host='localhost',user='root',password='123456',port=3306,db='spider')
    cursor=db.cursor()
    if not table_exists(cursor,entry['URL_Description']):
        #spider.job_template is assumed here as the template table to copy;
        #see the DDL sketch after getCountryEmployeeInfo below
        createTableSql="""CREATE TABLE IF NOT EXISTS spider.{} like spider.job_template;""".format(getTableName(entry['URL_Description']))
        try:
            cursor.execute(createTableSql)
            print('--------------create table %s--------------------' % (entry['URL_Description']))
        except:
            print('error in function getCountryEntry():create table failed.')
        finally:
            db.close()
    return {
        "CountryURL":"https://www.liepin.com"+citiesInfo.find(name='a',attrs={"rel":"nofollow"}).attrs['href'],
        "CountryURLDescription":entry['URL_Description']
    }
"""--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------"""
def getCountryEmployeeInfo(CountryEntry):
    """
    CountryEntry is the dict returned by getCountryEntry, made up of the nationwide
    listing URL CountryURL and the category description CountryURLDescription.
    This function extracts the information we want and stores it in MySQL.
    """
    if not CountryEntry:
        #debug
        print('ERROR in function getCountryEmployeeInfo():CountryEntry is None.')
        return None
    db=pymysql.connect(host='localhost',user='root',password='123456',port=3306,db='spider')
    cursor=db.cursor()
    indexOfPage=0
    theMaxLength=0
    #iterate over every page of this category's listings
    while indexOfPage<=theMaxLength:
        URL=CountryEntry['CountryURL']+'&curPage='+str(indexOfPage)
        pageHtml=getPageHtml(URL)
        soup=BeautifulSoup(pageHtml,'lxml')
        #extract the total number of pages for this category; this only needs to be done once
        if indexOfPage==0:
            prepareReString=soup.find(name='a',attrs={"class":"go","href":"javascript:;"}).attrs['onclick']
            pattern=re.compile('Math\.min\(Math\.max\(\$pn,\s\d\),(.*?)\)')
            theMaxLength=int(re.findall(pattern,prepareReString)[0])
        #debug: show which page is currently being accessed
        print('Accessing page {} of {}'.format(indexOfPage,CountryEntry['CountryURLDescription']))
        #move on to the next page
        indexOfPage+=1
        """
        The code below extracts the useful fields from each posting.
        """
        for detailedDescriptionURL in getDetailedDescriptionURL(soup):
            #skip agent-published postings, whose detail URLs are not crawled here
            if not isUrlValid(detailedDescriptionURL):
                continue
            detailedDescriptionHtml=getPageHtml(detailedDescriptionURL)
            #add the category identifier (e.g. java, php, ...) to the returned dict
            result=detailedInformation(detailedDescriptionHtml)
            result['ID']=CountryEntry['CountryURLDescription']
            """
            if 'ID' in result:
                print(type(result['ID']),'>>>',result)
            """
            if result['Available']:
                #get the minimum and maximum wage
                min_max=parseWage(result['wage'])
                #some companies have no benefit tags
                reallyTag=''
                if not result['tag']:
                    reallyTag='无'
                else:
                    reallyTag=result['tag']
                insertSql="""insert into spider.{} values(0,'{}','{}','{}',{},{},'{}','{}','{}','{}','{}','{}','{}');""".format(getTableName(result['ID']),result['position'],result['company'],result['wage'],min_max[0],min_max[1],result['education'],result['workExperience'],result['language'],result['age'],result['description'],reallyTag,result['workPlace'])
                try:
                    cursor.execute(insertSql)
                    db.commit()
                except:
                    db.rollback()
                    #debug
                    print('ERROR in function getCountryEmployeeInfo():execute sql failed.')
    #close the database connection after this category has been crawled
    db.close()
"""--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------"""
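#The original listing does not show the template table that the CREATE TABLE ... LIKE
#statement in getCountryEntry copies from, so the DDL below is only a sketch inferred
#from the 13-value INSERT above; the table name, column names and lengths are assumptions.
"""
CREATE TABLE IF NOT EXISTS spider.job_template (
    id             INT AUTO_INCREMENT PRIMARY KEY,
    position       VARCHAR(128),
    company        VARCHAR(128),
    wage           VARCHAR(64),
    wage_min       FLOAT,
    wage_max       FLOAT,
    education      VARCHAR(64),
    workExperience VARCHAR(64),
    language       VARCHAR(64),
    age            VARCHAR(64),
    description    TEXT,
    tag            VARCHAR(255),
    workPlace      VARCHAR(128)
) DEFAULT CHARSET=utf8mb4;
"""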
def getDetailedDescriptionURL(soup):
    """
    `soup` is the BeautifulSoup object for a nationwide listing page. This function is
    a generator: each iteration yields the URL string of one posting's detail page.
    """
    if not soup:
        #debug
        print('ERROR in function getDetailedDescriptionURL():soup is None.')
        return None
    for item in soup.find_all(name='div',attrs={"class":"job-info"}):
        detailedDescriptionURL=item.find(name='a',attrs={"target":"_blank"}).attrs['href']
        yield detailedDescriptionURL
"""--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------"""
def detailedInformation(detailedDescriptionHtml):
    """
    Extract one detailed job posting. `detailedDescriptionHtml` is the HTML of the
    posting's detail page; the function returns a dict `positionDescription` of the
    posting's concrete requirements.
    """
    if not detailedDescriptionHtml:
        #debug
        print('ERROR in function detailedInformation():detailedDescriptionHtml is None.')
        return None
    soup=BeautifulSoup(detailedDescriptionHtml,'lxml')
    #extract the position and company (both str)
    positionItem=soup.find(name='div',attrs={"class":"title-info"})
    #sometimes a posting has been deleted but its entry is still in the listing;
    #this check prevents that situation from crashing the crawler
    if not positionItem:
        return {
            'Available':False
        }
    #job title and company name (taken from the h1/h3 elements of the title-info block)
    position=positionItem.h1.text
    company=soup.find(name='div',attrs={"class":"title-info"}).h3.text
    #extract the wage (str, sometimes '面议'), work place, education, work experience,
    #language requirement and age requirement
    items=soup.find(name='div',attrs={"class":"job-title-left"})
    wage=items.find(name='p',attrs={"class":"job-item-title"}).text.split('\r')[0]
    workPlace=items.find(name='a')
    #some work places are abroad and the site provides no listing page for that region,
    #so there is no <a> tag; handle that case here
    if not workPlace:
        workPlace=items.find(name='p',attrs={"class":"basic-infor"}).text.strip()
    else:
        workPlace=workPlace.text
    #this returns a list of four items: education, work experience, language, age
    allFourNec=soup.find(name='div',attrs={"class":"job-qualifications"}).find_all(name='span')
    #some postings carry the company's benefit tags; extract them as a single
    #'-'-separated string, or an empty string if there are none
    tagItems=soup.find(name='ul',attrs={"class":"comp-tag-list clearfix","data-selector":"comp-tag-list"})
    tags=''
    if tagItems:
        tempTags=[]
        for tag in tagItems.find_all(name='span'):
            tempTags.append(tag.text)
        tags='-'.join(tempTags)
    #extract the detailed skill requirements for the position
    descriptionItems=soup.find(name='div',attrs={"class":"job-item main-message job-description"})
    description=descriptionItems.find(name='div',attrs={"class":"content content-word"}).text.strip()
    positionDescription={
        "Available":True,
        "position":position,
        "company":company,
        "wage":wage,
        "workPlace":workPlace,
        "education":allFourNec[0].text,
        "workExperience":allFourNec[1].text,
        "language":allFourNec[2].text,
        "age":allFourNec[3].text,
        "tag":tags,
        "description":description,
    }
    return positionDescription
"""--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------"""
if __name__=="__main__":
    startURL='https://www.liepin.com/it/'
    startHtml=getPageHtml(startURL)
    #crawl the data with multiple processes
    pool=Pool(4)
    for entry in getEntry(startHtml):
        countryEntry=getCountryEntry(entry)
        pool.apply_async(getCountryEmployeeInfo,args=(countryEntry,))
    pool.close()
    pool.join()
    print('All subprocesses done.')
    """
    for entry in getEntry(startHtml):
        countryEntry=getCountryEntry(entry)
        getCountryEmployeeInfo(countryEntry)
    """
    """
    pool=Pool()
    countryEntry0={
        "CountryURL":("https://www.liepin.com/zhaopin/?init=-1&headckid=8e288c3d8c61908a&flushckid=1&dqs=&fromSear"
            "chBtn=2&imscid=R000000035&ckid=8e288c3d8c61908a&key=%E8%87%AA%E7%84%B6%E8%AF%AD%E8%A8%80%E5"
            "%A4%84%E7%90%86&siTag=9IxD9f0Uv9ilBXoBMeeqIA~O7qm6Hv6o5wKSctHWDgu-A&d_sfrom=search_unknown&"
            "d_ckId=c5822d371553018b209ef5a7ed4603e0&d_curPage=0&d_pageSize=40&d_headId=c5822d371553018b"
            "209ef5a7ed4603e0"),
        "CountryURLDescription":"自然语言处理"
    }
    countryEntry1={
        "CountryURL":("https://www.liepin.com/zhaopin/?init=-1&headckid=e6fcd147b81c0d37&flushckid=1&dqs=&fromSear"
            "chBtn=2&imscid=R000000035&ckid=e6fcd147b81c0d37&key=Ruby&siTag=mRbR_Fn-IswEai_hYVvHZA~O7qm6"
            "Hv6o5wKSctHWDgu-A&d_sfrom=search_unknown&d_ckId=e03e58329b38476c3e348553f0cc0231&d_curPage="
            "0&d_pageSize=40&d_headId=e03e58329b38476c3e348553f0cc0231"),
        "CountryURLDescription":"Ruby"
    }
    countryEntry2={
        "CountryURL":("https://www.liepin.com/zhaopin/?init=-1&headckid=1a63ce33c5581803&flushckid=1&dqs=&fromSear"
            "chBtn=2&imscid=R000000035&ckid=1a63ce33c5581803&key=%E5%8A%9F%E8%83%BD%E6%B5%8B%E8%AF%95&si"
            "Tag=AQs5v9-xdx5UdW-LpKsEPA~O7qm6Hv6o5wKSctHWDgu-A&d_sfrom=search_unknown&d_ckId=7a78481b65a"
            "183fc4af27e9c7d7a8c6e&d_curPage=0&d_pageSize=40&d_headId=7a78481b65a183fc4af27e9c7d7a8c6e"),
        "CountryURLDescription":"功能测试"
    }
    countryEntry3={
        "CountryURL":("https://www.liepin.com/zhaopin/?init=-1&headckid=2e6d76848af03a10&flushckid=1&dqs=&fromSear"
            "chBtn=2&imscid=R000000035&ckid=2e6d76848af03a10&key=%E6%80%A7%E8%83%BD%E6%B5%8B%E8%AF%95&si"
            "Tag=3dIhGakkNW1f2XBXKFwGiQ~O7qm6Hv6o5wKSctHWDgu-A&d_sfrom=search_unknown&d_ckId=d4c7f7328e8"
            "776875fae4943831fc185&d_curPage=0&d_pageSize=40&d_headId=d4c7f7328e8776875fae4943831fc185"),
        "CountryURLDescription":"性能测试"
    }
    entryGroups=[countryEntry0,countryEntry1,countryEntry2,countryEntry3]
    pool.map(getCountryEmployeeInfo,entryGroups)
    pool.close()
    pool.join()
    print("++++++++++++++++++++++++++++++++++++++All subprocesses done++++++++++++++++++++++++++++++++++++++++++++++++++++")
    """
    """
    for entry in getEntry(startHtml):
        getCountryEntry(entry)
    """
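The title also promises a visualization of the scraped data, but this listing only covers the crawler itself. As a minimal sketch of that step, assuming the postings ended up in per-category tables shaped like the spider.job_template DDL sketched above (matplotlib is an extra dependency not imported in the listing), the average advertised salary per category could be charted like this:

import pymysql
import matplotlib.pyplot as plt

def plotAverageWage(tables):
    """Bar-chart the average of (wage_min + wage_max) / 2 for each category table (sketch only)."""
    db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='spider')
    cursor = db.cursor()
    averages = {}
    for table in tables:
        #wage_min / wage_max follow the assumed template-table sketch above
        cursor.execute("SELECT AVG((wage_min + wage_max) / 2) FROM spider.{};".format(table))
        value = cursor.fetchone()[0]
        averages[table] = float(value) if value is not None else 0.0
    db.close()
    plt.bar(list(averages.keys()), list(averages.values()))
    plt.ylabel('average salary (10k RMB)')
    plt.title('Average advertised salary by category')
    plt.show()

#Example, with table names as produced by getTableName above:
#plotAverageWage(['Java', 'PHP', 'CPP'])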