由于房天下的房源页面不是动态加载的,而是静态页面,所以爬取起来非常简单,使用 XPath 就可以轻松获取信息。以下是爬取杭州新房信息的源码,爬取内容为楼盘名称、地址、所在区以及价格。
# coding:utf-8
"""Scrape Hangzhou new-home listings from fang.com into an Excel workbook.

For every listing the project name, address, district and price are
collected.  The listing pages are static HTML, so plain XPath queries are
enough to extract the data.
"""
import sys

import requests
import xlwt
from lxml import etree

if sys.version_info[0] < 3:
    # Python 2 only: make UTF-8 the implicit str<->unicode codec so the
    # Chinese byte-string literals below compare and concatenate cleanly
    # with the text scraped from the page.  Python 3 has no reload()
    # builtin and does not need this.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')

# Seconds to wait on the server before a request is abandoned.
REQUEST_TIMEOUT = 10

# Price labels that are stored exactly as scraped.
PASS_THROUGH_PRICES = ("价格待定", "新房直销")

# Numbers below this are treated as office rent, quoted per square metre per day.
DAILY_RENT_LIMIT = 20

# Numbers below this (and not rent) are treated as a total price in units of 万元.
TOTAL_PRICE_LIMIT = 3000


def _formatPrice(raw):
    """Convert the raw text of a price node into the string to store.

    Args:
        raw: Text of the price element; may be None when the element is empty.

    Returns:
        The label unchanged for the pass-through labels, "新房已售完" for the
        "请查看二手房源" label, the number with a unit suffix for numeric text,
        and the stripped text itself for anything else (instead of raising).
    """
    text = (raw or "").strip()
    if text in PASS_THROUGH_PRICES:
        return text
    if text == "请查看二手房源":
        return "新房已售完"
    try:
        amount = float(text)
    except ValueError:
        # Unknown, non-numeric label: keep it rather than abort the whole run.
        return text
    if amount < DAILY_RENT_LIMIT:
        return text + "元/平方.天"
    if amount < TOTAL_PRICE_LIMIT:
        return text + "万元起"
    return text + "元/平方起"


def getList(url):
    """Fetch one listing page and return its rows.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list of [name, address, district, price] lists, one per listing.
        The district is the first two characters of the address.  When the
        page yields different numbers of names, addresses and prices, only
        the leading entries present in all three are returned.

    Raises:
        requests.RequestException: the download failed or timed out.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # The response body is decoded as gb2312.
    response.encoding = "gb2312"
    html = etree.HTML(response.text)

    name_nodes = html.xpath('//*[@class="nlcd_name"]/a')
    addresses = html.xpath('//*[@class="address"]//@title')
    price_nodes = html.xpath('//*[@class="nhouse_price"]//span|//*[@class="kanesf"]//p//a|//*[@class="kanzx"]//h3//a')
    prices = [_formatPrice(node.text) for node in price_nodes]

    info = []
    # zip() visits every listing, including the last one on the page.
    for name_node, address, price in zip(name_nodes, addresses, prices):
        name = (name_node.text or "").strip()
        address = address.strip()
        district = address[0:2]
        print(name + "\t" + address + "\t" + district + "\t" + price)
        info.append([name, address, district, price])
    return info


def createExcelAndSheet(excelName, sheetName, *colName):
    """Scrape every listing page and write the rows to an Excel workbook.

    Row 0 holds the column titles.  Each following row holds a running
    number in column 0 followed by the fields returned by getList().

    Args:
        excelName: Path of the .xls file to write.
        sheetName: Name of the worksheet to create.
        *colName: Column titles for the header row, in column order.
    """
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet(sheetName, cell_overwrite_ok=True)
    for col, title in enumerate(colName):
        sheet.write(0, col, title)

    row_num = 1
    page_num = 1
    try:
        # NOTE(review): pagination stops at the first page that yields no
        # rows; confirm the site serves an empty listing past the last page.
        while True:
            print("page" + str(page_num))
            url = ("http://newhouse.hz.fang.com/house/s/b9" + str(page_num)
                   + "/?ctm=1.hz.xf_search.page." + str(page_num))
            page_num += 1
            rows = getList(url)
            if not rows:
                break
            for row in rows:
                sheet.write(row_num, 0, row_num)
                for col, value in enumerate(row, 1):
                    sheet.write(row_num, col, value)
                row_num += 1
    finally:
        # Save whatever was collected, even if a later page failed.
        workbook.save(excelName)


if __name__ == "__main__":
    createExcelAndSheet("fangtianxia.xls", "hangzhou", "number", "name", "address", "district", "price")