A Small Web-Scraping Experiment

Time: 2025-02-21 08:40:28
import requests, json, time, pymongo, random
from bs4 import BeautifulSoup

# MongoDB setup (left commented out in this experiment; the record is only printed)
# client = pymongo.MongoClient('localhost', 27017)
# ganji = client['ganji']
# url_list = ganji['url_list']
# item_info = ganji['item_info']

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/601.4.4 (KHTML, like Gecko) Version/9.0.3 Safari/601.4.4',
    'Connection': 'keep-alive'
}

# A small pool of HTTP proxies; one is picked at random.
# Note: proxies is built here but not actually passed to requests.get below.
proxy_list = [
    'http://120.41.0.29:8080',
    'http://58.20.132.23:8088',
    'http://121.140.126.250:3128'
]
proxy_ip = random.choice(proxy_list)
proxies = {'http': proxy_ip}


def get_item_info_from(url, data=None):
    # Fetch the listing page and pull out a few fields with CSS selectors.
    wb_data = requests.get(url, headers=headers)
    if wb_data.status_code == 404:
        pass
    else:
        try:
            soup = BeautifulSoup(wb_data.text, "lxml")
            data = {
                'leibie': soup.select('#header > .f12 > span.crb_i > a')[1].text,
                'title': soup.title.text.strip(),
                'pub_data': soup.select('#index_show > ul.mtit_con_left.fl > li')[0].text,
                'price': soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li > div.su_con > span')[0].text.strip(),
                'area': list(map(lambda x: x.text, soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li > div.su_con > span > a'))),
                # 'chengse': soup.select(''),
                # 'url': url
            }
            print(data)
        except AttributeError:
            pass
        except IndexError:
            pass


get_item_info_from('http://bj.ganji.com/pingbandiannao/')
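The MongoDB client and collections at the top are commented out, so the scraped record is only printed. A minimal sketch of how the data dict could instead be persisted, assuming a local MongoDB instance on the default port and the same 'ganji' / 'item_info' names used in the commented-out setup:

import pymongo

# Assumed local MongoDB on the default port, matching the commented-out lines above.
client = pymongo.MongoClient('localhost', 27017)
ganji = client['ganji']
item_info = ganji['item_info']

def save_item(data):
    # Insert one scraped record (insert_one is the pymongo 3.x+ API).
    item_info.insert_one(data)

Inside get_item_info_from, the print(data) call could then be followed by save_item(data) to build up the collection while crawling.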