核心算法代码分享如下:
from selenium import webdriver
from selenium.webdriver.common.by import By
from lxml import etree
import time
from selenium.webdriver.chrome.options import Options
import pymysql
import re
import json
# Rental listings for first-tier cities.
# City codes to crawl; the full list is kept here for reference and is
# currently disabled: ['bj','sh','gz','sz']
cities = ['sz']
# Default Chrome options (no flags are set here).
options = Options()
# Single shared browser session, created at import time and used by
# get_url_info() through this module-level name.
# NOTE(review): `executable_path` is the Selenium 3 style argument; newer
# Selenium 4 releases expect a `Service` object instead — confirm against the
# installed Selenium version.
driver = webdriver.Chrome(executable_path=r'chromedriver.exe', options=options)
def _first(values, default=''):
    """Return the first item of an XPath result list, or *default* if empty."""
    return values[0] if values else default


def _read_page_count(default=20):
    """Read the number of listing pages from the pager of the current page.

    The page count is the label of the link immediately before the "next"
    link. ``find_elements`` is used so that a missing pager yields *default*
    instead of raising.
    """
    labels = driver.find_elements(
        By.XPATH,
        '//*[@id="pager_wrap"]/div[@class="pager"]/a[@class="next"]'
        '/preceding-sibling::a[1]/span')
    if labels:
        text = labels[0].text.strip()
        if text.isdigit():
            return int(text)
    return default


def _parse_listing_items(page_source):
    """Extract the basic fields of every listing on one result page.

    Each ``<li ep-log=...>`` is queried on its own so that a listing with a
    missing image or price cannot shift the fields of its neighbours.
    Listings without a parsable ``houseid``, a link or a title are skipped.

    Returns a list of dicts with keys ``houseid``, ``pic``, ``url``,
    ``title``, ``price`` and ``unit``.
    """
    tree = etree.HTML(page_source)
    items = []
    for li in tree.xpath('//ul[@class="house-list"]/li[@ep-log]'):
        try:
            house_id = json.loads(li.get('ep-log'))['houseid']
        except (ValueError, KeyError, TypeError):
            continue
        href = _first(li.xpath('./div[@class="des"]/h2/a/@href'), None)
        title = _first(li.xpath('./div[@class="des"]/h2/a/text()'), None)
        if href is None or title is None:
            continue
        money = './div[@class="list-li-right"]/div[@class="money"]/b'
        items.append({
            'houseid': house_id,
            'pic': _first(li.xpath('./div[@class="img-list"]/a/img/@src')),
            'url': href,
            'title': title,
            'price': _first(li.xpath(money + '/text()')),
            'unit': _first(li.xpath(money + '/following-sibling::text()')),
        })
    return items


def _parse_detail_page(page_source, data):
    """Fill *data* in place with the fields found on a listing's detail page.

    A field is only added when the corresponding node exists on the page.
    """
    page = etree.HTML(page_source)
    pay_way = page.xpath('//span[@class="instructions"]/text()')
    type_str = page.xpath('//ul[@class="f14"]/li[2]/span[2]/text()')
    floor_str = page.xpath('//ul[@class="f14"]/li[3]/span[2]/text()')
    estate = page.xpath('//ul[@class="f14"]/li[4]/span[2]/a/text()')
    areas = page.xpath('//ul[@class="f14"]/li[5]/span[2]/a/text()')
    addresses = page.xpath('//span[@class="dz"]/text()')
    times = page.xpath('//div[@class="house-title"]/p/text()')
    agents = page.xpath('//*[@id="vipAgent"]/div[1]/p[1]/a/text()')
    disposals = page.xpath('//ul[@class="house-disposal"]/li[not(@class="no-config")]/text()')
    spots = page.xpath('//ul[@class="introduce-item"]/li[1]/span[2]/em/text()')
    descs = page.xpath('//ul[@class="introduce-item"]//li[3]/span[2]/text()')
    print(descs)

    if pay_way:
        data['house_pay_way'] = pay_way[0]

    if type_str:
        # Layout / area / decoration are separated by two non-breaking spaces.
        types = type_str[0].split("\xa0\xa0")
        if len(types) == 3:
            data['house_type'] = types[0]
            data['house_area'] = types[1].split(' ')[0] + "平"
            data['house_decora'] = types[2]
        elif len(types) == 2:
            data['house_type'] = types[0]
            data['house_area'] = types[1]
        elif len(types) == 1:
            data['house_type'] = types[0]

    if floor_str:
        # Orientation and floor info are separated by two non-breaking spaces.
        floors = floor_str[0].split("\xa0\xa0")
        if len(floors) == 2:
            data['toward'] = floors[0]
            floor_parts = floors[1].split('/')
            if len(floor_parts) == 2:
                data['floor'] = floor_parts[0]
                data['floor_height'] = floor_parts[1]
            elif len(floor_parts) == 1:
                # Only set the floor when a number is actually present.
                digits = re.findall(r'\d{1,2}', floor_parts[0])
                if digits:
                    data['floor'] = digits[0] + '层'
        elif len(floors) == 1:
            data['toward'] = floors[0]

    if estate:
        data['house_estate'] = estate[0].strip()
    if areas:
        data['area'] = areas[0]
    if addresses:
        data['address'] = addresses[0].strip()
    if times:
        # The last text node of the title paragraph is used; the part before
        # the first non-breaking space is stored as the time.
        data['time'] = times[-1].strip().split('\xa0')[0]
        print("时间:" + data['time'])
    if agents:
        data['agent_name'] = agents[0].strip()
    if disposals:
        data['house_disposal'] = ' '.join(disposals).strip()
    if spots:
        data['house_spot'] = ' '.join(spots)
    if descs:
        data['house_desc'] = descs[0]


def get_url_info(url):
    """Crawl the rental listings reachable from *url* and store new ones.

    Opens *url* in the shared module-level ``driver``, follows the rental
    link into the listing view, then walks the result pages. For every
    listing that ``not_exists()`` reports as new, the detail page is opened,
    parsed, and the collected fields are passed to ``to_mysql()``.

    Args:
        url: Address of the city landing page to start from.
    """
    driver.get(url)
    # Long pause kept from the original flow before the page is refreshed.
    time.sleep(40)
    driver.refresh()
    driver.minimize_window()
    zufang = driver.find_element(
        By.XPATH,
        '/html/body/div[3]/div[1]/div[1]/div/div[1]/div[1]/span[1]/a')
    zufang.click()
    # The rental link opens a new window; continue in the newest one.
    driver.switch_to.window(driver.window_handles[-1])
    time.sleep(1)

    next_xpath = '//*[@id="pager_wrap"]/div[@class="pager"]/a[@class="next"]'
    for _ in range(_read_page_count()):
        # Parse the page that is currently displayed, so that each iteration
        # sees the listings of its own page rather than those of page one.
        for item in _parse_listing_items(driver.page_source):
            print(item['houseid'])
            # NOTE(review): the duplicate check queries column `id`, but no
            # `id` key is written into `data`, so rows stored by this
            # function are only detected if the table fills `id` some other
            # way — confirm against the table schema.
            if not not_exists(houseid=item['houseid']):
                continue

            data = {
                'pic': item['pic'],    # listing image link
                'url': item['url'],    # listing detail URL
            }
            # The link text has the form "<rent way> | <title>"; without a
            # separator the whole text is used as the title.
            title_parts = item['title'].split('|')
            if len(title_parts) > 1:
                data['house_title'] = title_parts[1].strip()
                data['rent_way'] = title_parts[0].strip()
            else:
                data['house_title'] = title_parts[0].strip()
            data['house_pay'] = ''.join([item['price'], item['unit'].strip()])

            time.sleep(3)
            driver.get(data['url'])  # open the detail page
            _parse_detail_page(driver.page_source, data)
            print(data)
            to_mysql(data)
            driver.back()
            time.sleep(1)

        # Locate the "next" link only now: a reference taken before visiting
        # the detail pages would be stale after navigating away and back.
        next_links = driver.find_elements(By.XPATH, next_xpath)
        if not next_links:
            break  # last page reached
        next_links[0].click()
        # Give the next result page a moment to load before reading it.
        time.sleep(1)
def not_exists(houseid):
    """Check whether a listing is absent from the ``house_info`` table.

    Args:
        houseid: Value compared against the ``id`` column.

    Returns:
        True if no row has ``id`` equal to *houseid*; False (after printing
        ``'exists'``) if at least one such row is found.
    """
    table = 'house_info'
    # The table name is a fixed constant; the scraped id is passed as a bound
    # parameter so it is never interpolated into the SQL text.
    sql_search = "SELECT COUNT(1) FROM {table} where id = %s".format(table=table)
    db = pymysql.connect(host='localhost', user='root', password='123456', port=3396, db='model')
    try:
        with db.cursor() as cursor:
            cursor.execute(sql_search, (houseid,))
            count = cursor.fetchone()[0]
    finally:
        # Always release the connection, also when the query fails.
        db.close()
    if count > 0:
        print('exists')
        return False
    return True
def to_mysql(data):
    """Insert one listing into the ``house_info`` table.

    The dict keys are used as column names and the values are bound as query
    parameters. The write is best-effort: a failure is reported on stdout and
    rolled back instead of being raised to the caller.

    Args:
        data: Mapping of column name to value. An empty mapping is ignored.
    """
    if not data:
        # Nothing to store; avoid issuing an INSERT without columns.
        return
    table = 'house_info'
    keys = ', '.join(data)
    values = ', '.join(['%s'] * len(data))
    sql = 'INSERT INTO {table}({keys}) VALUES ({values})'.format(table=table, keys=keys, values=values)
    db = pymysql.connect(host='localhost', user='root', password='123456', port=3396, db='model')
    try:
        with db.cursor() as cursor:
            if cursor.execute(sql, tuple(data.values())):
                print("Successful")
        db.commit()
    except Exception as exc:
        # Keep the crawl running on a failed insert, but show the cause and
        # do not swallow KeyboardInterrupt / SystemExit.
        print('Failed', exc)
        db.rollback()
    finally:
        db.close()
if __name__ == '__main__':
    # Crawl every configured city, then shut the browser down even if a
    # crawl fails part-way through.
    try:
        for city in cities:
            # NOTE: 'XXXXX' is a placeholder for the real URL template. The
            # template must contain one %s for the city code; the % operator
            # raises TypeError on a string without a format specifier.
            url = 'XXXXX'%city
            get_url_info(url)
    finally:
        driver.quit()