爬虫实例:饿了么爬虫

时间:2024-03-05 10:28:01

 

饿了么外卖网站是一个ajax动态加载的网站

Version1:直接页面提取

from lxml import etree
import requests
import sys
import time

# Python 2 only: reload(sys) brings back sys.setdefaultencoding so the
# process-wide default string encoding can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')


url = 'https://www.ele.me/place/ws101hcw982?latitude=22.52721&longitude=113.95232'
response = requests.get(url)
print(response.status_code)

# No sleep here: requests.get() has already returned the full response and
# requests never executes the page's JavaScript, so waiting cannot add content.
html = response.content
selector = etree.HTML(html)
rez = selector.xpath('//*[@class="place-rstbox clearfix"]')
print('haha %s' % (rez,))  # []

# Restaurant titles gathered from every matched container.
names = []
for box in rez:
    # The leading "." scopes each query to the current container; a path that
    # starts with "//" is evaluated against the whole document instead.
    titles = box.xpath('.//*[@class="rstblock-title"]/text()')
    print(titles)
    names.extend(titles)
    # Extracted for reference only; this version does not print them.
    msales = box.xpath('.//*[@class="rstblock-monthsales"]/text()')
    tip = box.xpath('.//*[@class="rstblock-cost"]/text()')
    stime = box.xpath('.//*[@class="rstblock-logo"]/span/text()')

print(u'店名')
# Show only the first title, and only if anything was matched at all.
if names:
    print(names[0])

问题:XPath表达式//*[@class="place-rstbox clearfix"]本身没有问题,但是rez输出为空——商家列表是通过ajax动态加载的,requests获取到的初始HTML中并不包含这些节点

 

Version2:通过接口提取

geohash=ws101hcw982&latitude=22.52721&longitude=113.95232:位置信息参数及参数值

terminal=web:渠道信息

extras[]=activities和offset=0:含义未知

import requests
import json

# Query the restaurant-list endpoint directly; location and paging are all
# carried in the query string.
url = 'https://www.ele.me/restapi/shopping/restaurants?extras[]=activities&geohash=ws101hcw982&latitude=22.52721&limit=30&longitude=113.95232&offset=0&terminal=web'
resp = requests.get(url)
print(resp.status_code)
# Fail loudly on an HTTP error instead of trying to iterate an error body
# as if it were the restaurant list.
resp.raise_for_status()

Jdata = json.loads(resp.text)

# Each element is indexed by string keys below, one element per restaurant.
for n in Jdata:
    name = n['name']
    msales = n['recent_order_num']
    stime = n['order_lead_time']
    tip = n['description']
    phone = n['phone']
    print(name)

输出:原以为把limit改为100就可以一次提取100条商家信息,然而接口每次最多只返回30条

 Version3:通过selenium提取

from selenium import webdriver
import selenium.webdriver.support.ui as ui
import time

driver = webdriver.PhantomJS(executable_path=r"C:\Python27\phantomjs.exe")
# Alternative browser: driver = webdriver.Chrome()
try:
    driver.get('https://www.ele.me/place/ws101hcw982?latitude=22.52721&longitude=113.95232')
    # Give the page time to load before taking the screenshot.
    time.sleep(10)
    # NOTE(review): WebDriver screenshots are PNG data; the .jpg extension
    # only names the file - confirm this is intended.
    driver.get_screenshot_as_file("E:\\Elm_ok.jpg")

    # Block for up to 10 seconds until the restaurant container is present.
    wait = ui.WebDriverWait(driver, 10)
    wait.until(lambda d: d.find_element_by_xpath('//div[@class="place-rstbox clearfix"]'))

    # find_element_* returns only the first matching element.
    name = driver.find_element_by_xpath('//*[@class="rstblock-title"]').text
    msales = driver.find_element_by_xpath('//*[@class="rstblock-monthsales"]').text
    tip = driver.find_element_by_xpath('//*[@class="rstblock-cost"]').text
    stime = driver.find_element_by_xpath('//*[@class="rstblock-logo"]/span').text

    print(name)  # sample output: 乐凯撒比萨(生态园店)
finally:
    # Always shut the browser down, even when the wait or a lookup fails,
    # so no PhantomJS process is left running.
    driver.quit()

注:find_element只返回第一个匹配的元素,因此这里只能提取到一家商家的信息;要提取全部商家需使用find_elements

改进版

#coding=utf-8
from selenium import webdriver
import selenium.webdriver.support.ui as ui
import time


driver = webdriver.PhantomJS(executable_path=r"C:\Python27\phantomjs.exe")
# Alternative browser: driver = webdriver.Chrome()
driver.get('https://www.ele.me/place/ws101hcw982?latitude=22.52721&longitude=113.95232')
# Give the page time to start loading its content.
time.sleep(10)

# Block for up to 10 seconds until the restaurant container is present.
wait = ui.WebDriverWait(driver, 10)
wait.until(lambda d: d.find_element_by_xpath('//div[@class="place-rstbox clearfix"]'))
def execute_times(times, pause=5):
    """Scroll the page to the bottom ``times`` times, pausing after each scroll.

    Operates on the module-level ``driver``.

    Args:
        times: Number of scrolls to perform; zero or a negative value
            performs none.
        pause: Seconds to sleep after each scroll (defaults to 5).
    """
    # range(times), not range(times + 1): scroll exactly as often as requested.
    for _ in range(times):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
try:
    # Keep scrolling so the page appends more restaurants before extraction.
    execute_times(20)

    # find_elements_* returns a list holding every matching WebElement.
    name = driver.find_elements_by_xpath('//*[@class="rstblock-title"]')
    msales = driver.find_elements_by_xpath('//*[@class="rstblock-monthsales"]')
    tip = driver.find_elements_by_xpath('//*[@class="rstblock-cost"]')
    stime = driver.find_elements_by_xpath('//*[@class="rstblock-logo"]/span')

    print(type(tip))  # <type 'list'>
    print(len(name))  # 120

    for i in name:
        print(i.text)
finally:
    # Always shut the browser down, even when scrolling or a lookup fails,
    # so no PhantomJS process is left running.
    driver.quit()

说明:通过execute_times函数,滚动条每下移一次,休息5s,从而使页面加载更多的商家信息

输出: