# 爬取链家网站二手房房源信息;第一次做,仅供参考;需要使用 Scrapy。
import scrapy,pypinyin,requests
import bs4
from ..items import LianjiaItem
class LianjiaSpider(scrapy.Spider):
    """Crawl second-hand housing ("ershoufang") listing pages of Lianjia city sites.

    The crawl has three stages, all executed through Scrapy's scheduler:

    1. ``start_requests`` fetches the city index page (``url_0``).
    2. ``parse_cities`` reads the city links from that page and schedules
       ``max_pages`` listing pages per city.
    3. ``parse`` turns every listing card of a page into a ``LianjiaItem``.

    Nothing is downloaded when this module is imported; ``start_urls`` is
    kept only as an (empty) class attribute because the seed requests are
    produced by ``start_requests``.
    """

    name = 'lianjia_dl'
    # The listing URLs are built from the hrefs found on the city index page,
    # which are expected to live on hosts other than ``www`` (presumably
    # per-city subdomains -- TODO confirm), so the parent domain is allowed.
    allowed_domains = ['lianjia.com']
    start_urls = []
    # Index page that links to every city site.
    url_0 = 'https://www.lianjia.com/city/'
    # Number of listing pages (pg1 .. pg<max_pages>) requested per city.
    max_pages = 100

    def start_requests(self):
        """Yield the single seed request for the city index page."""
        yield scrapy.Request(self.url_0, callback=self.parse_cities)

    def parse_cities(self, response):
        """Yield one request per listing page of every city on the index page.

        City links are taken from the first ``<a>`` of each ``<li>`` in the
        first ``<ul>`` of every ``div.city_province`` element. Links whose
        href contains ``'fang'`` are skipped.
        """
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        for province in soup.find_all('div', class_='city_province'):
            city_list = province.find('ul')
            if city_list is None:
                continue
            for entry in city_list.find_all('li'):
                link = entry.find('a')
                href = link.get('href') if link is not None else None
                # Skip entries without a link and links containing 'fang'.
                if not href or 'fang' in href:
                    continue
                # Resolve relative hrefs and guarantee exactly one slash
                # between the city root and the 'ershoufang' path.
                base = response.urljoin(href)
                if not base.endswith('/'):
                    base += '/'
                for page in range(1, self.max_pages + 1):
                    yield scrapy.Request(
                        '{}ershoufang/pg{}/'.format(base, page),
                        callback=self.parse,
                    )

    def parse(self, response):
        """Yield a ``LianjiaItem`` for every listing card on a listing page.

        A card that lacks one of the expected elements or text segments is
        logged and skipped, so it does not discard the remaining cards of
        the page.
        """
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        for card in soup.find_all('div', class_='info clear'):
            try:
                item = self._build_item(card)
            except (AttributeError, IndexError) as exc:
                # AttributeError: an expected element was not found (None);
                # IndexError: a text field had fewer segments than expected.
                self.logger.warning(
                    'Skipping malformed listing on %s: %s', response.url, exc
                )
                continue
            yield item

    @staticmethod
    def _build_item(card):
        """Build a ``LianjiaItem`` from one ``div.info.clear`` listing card.

        Raises ``AttributeError`` or ``IndexError`` when the card does not
        have the expected structure.
        """
        address = card.find('div', class_='address')
        # houseInfo text is '|'-separated; segments 1-4 are stored below.
        house_info = address.find('div', class_='houseInfo').text.split('|')
        # positionInfo text is '-'-separated: segment 0 -> louceng, 1 -> quyu.
        position = (
            card.find('div', class_='flood')
            .find('div', class_='positionInfo')
            .text.split('-')
        )
        price = card.find('div', class_='priceInfo')

        item = LianjiaItem()
        item['xiaoqu'] = address.find('a').text
        # .replace(' ', '') removes every space inside the segment.
        item['huxing'] = house_info[1].replace(' ', '')
        item['mianji'] = house_info[2].replace(' ', '')
        item['chaoxiang'] = house_info[3].replace(' ', '')
        item['zhuangxiu'] = house_info[4].replace(' ', '')
        item['quyu'] = position[1].replace(' ', '')
        item['louceng'] = position[0].replace(' ', '')
        # Keep only the text that follows the '单价' label.
        item['danjia'] = (
            price.find('div', class_='unitPrice')
            .find('span')
            .text.split('单价')[1]
            .replace(' ', '')
        )
        item['fangjia'] = price.find('div', class_='totalPrice').text
        return item