Scraping Meituan merchant information
import requests
from bs4 import BeautifulSoup
import json

# Entry URL for the city site (Kunming)
url = 'http://km.meituan.com/'

# Shop detail page, formatted with a shop id
url_shop = 'http://km.meituan.com/shop/{}'

# Request headers copied from a browser session
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'DNT': '1',
    # Host must match the site being crawled (km.meituan.com, not bj.meituan.com)
    'Host': 'km.meituan.com',
    'Proxy-Connection': 'keep-alive',
    'Referer': 'http://bj.meituan.com/shop/286725?acm=UwunyailsW15518532529028663069.286725.1&mtt=1.index%2Fdefault%2Fpoi.pz.1.j4cijrmg&cks=58899',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}


# Collect the links of all second-level category menus from the homepage
def get_start_menu_links():
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'lxml')
    links = [link.find('div').find('div').find('dl').find('dt').find('a')['href']
             for link in soup.find_all('div', class_='J-nav-item')]
    return links


# Extract the shop ids embedded in the category page's async-load parameters
def get_shop_ids(url, headers=None):
    html = requests.get(url, headers=headers).text
    soup = BeautifulSoup(html, 'lxml')
    content_id = json.loads(soup.find('div', class_='J-scrollloader cf J-hub')['data-async-params'])
    return json.loads(content_id.get('data')).get('poiidList')


def main():
    start_menu_links = get_start_menu_links()
    for link in start_menu_links:
        for pageNum in range(4, 5):
            category_url = link + '/all/page{}'.format(pageNum)
            for shop_id in get_shop_ids(category_url, headers=headers):
                html = requests.get(url_shop.format(shop_id), headers=headers).text
                soup = BeautifulSoup(html, 'lxml')
                shop_detail = soup.find('div', class_='summary biz-box fs-section cf')
                print("================================== pageNum %d shop_id: %s ==================================" % (pageNum, shop_id))
                try:
                    # Skip shops whose detail page lacks the summary block
                    shop_detail.find('div', class_='fs-section__left').find('h2').find('span').text
                except AttributeError:
                    continue
                print("Name: " + shop_detail.find('div', class_='fs-section__left').find('h2').find('span').text)
                print("Address: " + shop_detail.find('div', class_='fs-section__left').find('p', class_='under-title').find('span').text)
                print("Contact: " + shop_detail.find('div', class_='fs-section__left').find('p', class_='under-title').find_next_sibling().text)


if __name__ == '__main__':
    main()
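
The trick in get_shop_ids() is that the shop ids are not plain links on the category page: they sit in the data-async-params attribute of the scroll-loader div, and that attribute's "data" field is itself a JSON-encoded string, so it has to be decoded twice. Below is a minimal, self-contained sketch of that double decode using a made-up HTML fragment; the real attribute on meituan.com carries more fields than shown here, and only poiidList is used by the crawler.

import json
from bs4 import BeautifulSoup

# Made-up fragment mimicking the structure get_shop_ids() relies on:
# the attribute value is JSON, and its "data" field is a JSON *string*.
sample_html = r'''
<div class="J-scrollloader cf J-hub"
     data-async-params='{"data": "{\"poiidList\": [286725, 286726]}"}'>
</div>
'''

soup = BeautifulSoup(sample_html, 'lxml')
node = soup.find('div', class_='J-scrollloader cf J-hub')

outer = json.loads(node['data-async-params'])  # first decode: the attribute itself
inner = json.loads(outer['data'])              # second decode: the embedded JSON string
print(inner['poiidList'])                      # [286725, 286726]

If Meituan changes the attribute layout, json.loads raises a ValueError here, which fails louder and is easier to debug than silently scraping an empty list.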