# 利用 Python 爬取某壳的房产数据
import requests
from pyquery import PyQuery as pq
import json
import pandas as pd
import datetime,time
# Column order used for every row that get_a_page appends to the CSV file.
columns = [
    'id',
    'title',
    'place',
    'msg',
    'price',
    'per_meter',
    'area',
    'city',
]

# District names selected by the integer `area` argument of get_a_page;
# the order matches the URL slug list in the __main__ section.
areas = [
    '滨湖区',
    '梁溪区',
    '新吴区',
    '惠山区',
    '锡山区',
    '江阴市',
    '宜兴市',
]
# 爬取某网页
def get_a_page(url, area):
    """Scrape one listing page and append its rows to today's CSV file.

    Downloads ``url``, reads every listing card found under
    ``.sellListContent``, prints each listing as one JSON line prefixed with
    its 1-based position on the page, and appends all listings (without a
    header row) to ``wx<YYYY-MM-DD>.csv`` using a relative path.

    Args:
        url: Address of the listing page to download.
        area: Index into the module-level ``areas`` list; the selected name
            is stored in the ``area`` column of every row.

    Raises:
        IndexError: If ``area`` is not a valid index into ``areas``.
        requests.RequestException: If the request times out, fails, or the
            server answers with an HTTP error status.
    """
    # Resolve the district name once, before any network traffic, so a bad
    # index fails immediately instead of part-way through a page.
    area_name = areas[area]
    city = '无锡'

    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(url, timeout=30)
    # Surface HTTP errors instead of silently parsing an error page into
    # zero rows.
    response.raise_for_status()

    doc = pq(response.text)
    listings = doc('.sellListContent').children('.clear .info.clear').items()

    rows = []
    for count, div in enumerate(listings, start=1):
        row = {
            'id': div.children('.address .priceInfo .unitPrice').attr('data-hid'),
            'title': div.children('.title a').text(),
            'place': div.children('.address .flood .positionInfo a').text(),
            'msg': div.children('.address .houseInfo').text(),
            'price': div.children('.address .priceInfo .totalPrice span').text(),
            'per_meter': div.children('.address .priceInfo .unitPrice span').text(),
            'area': area_name,
            'city': city,
        }
        rows.append(row)
        print(str(count) + ':' + json.dumps(row, ensure_ascii=False))

    # `columns` fixes the column order of the CSV; header=False because the
    # file is opened in append mode and receives many pages.
    df = pd.DataFrame(data=rows, columns=columns)
    df.to_csv('wx' + time.strftime('%Y-%m-%d') + '.csv', mode='a', index=False, header=False)
if __name__ == '__main__':
    # URL slugs of the districts, listed in the same order as `areas`, so the
    # position of a slug doubles as the `area` index passed to get_a_page.
    district_slugs = ['binhu', 'liangxi', 'xinwu', 'huishan', 'xishan', 'jiangyinshi', 'yixingshi']
    for area_index, slug in enumerate(district_slugs):
        # Result pages 1 through 19 of each district.
        for page in range(1, 20):
            get_a_page(f'https://wx.ke.com/ershoufang/{slug}/pg{page}tt9/', area_index)