用爬虫爬取下面的租房网站。
首先将前两页搜索结果中所有房源的详情页链接取出来:
# Module-level accumulator: detail-page URLs collected by get_page_link().
page = []
def get_page_link(page_num):
    """Collect listing detail-page URLs into the module-level `page` list.

    Scans search-result pages 1 .. page_num-1 (note: range(1, page_num)
    excludes the upper bound) and appends each listing's `detailurl`
    attribute to `page`.

    :param page_num: exclusive upper bound of the result-page index
    """
    for i in range(1, page_num):
        # .format(i) converts to str automatically; wrapping in str() was redundant.
        url = 'http://bj.xiaozhu.com/%E5%9C%B0%E9%93%81%E4%B9%9D%E5%8F%B7%E7%BA%BF_lysys-duanzufang-p{}-20/?putkey=%E5%9C%B0%E9%93%81%E4%B9%9D%E5%8F%B7%E7%BA%BF'.format(i)
        # timeout prevents the crawler from hanging forever on a stalled request
        wb_data = requests.get(url, timeout=10)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        all_link = soup.select('ul > li > div.result_btm_con.lodgeunitname')
        for link in all_link:
            page.append(str(link.get('detailurl')))
        # visual separator between result pages
        print('------------------')
# Crawl result pages 1 and 2 (range(1, 3)) and fill the `page` list.
get_page_link(3)
def get_information(url):
    """Fetch one listing detail page and print a one-line summary.

    Prints title, address, daily price, host name and host gender
    (decoded from the avatar's CSS class via print_gender).

    :param url: absolute URL of a listing detail page
    """
    # timeout prevents the crawler from hanging forever on a stalled request
    wb_data = requests.get(url, timeout=10)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    title = soup.select('div > div.con_l > div.pho_info > h4')[0].get_text()
    address = soup.select('div > div.con_l > div.pho_info > p > span')[0].get_text()
    day_price = soup.select('div.day_l > span')[0].get_text()
    host_name = soup.select('div > div.w_240 > h6 > a')[0].get_text()
    # the avatar div's first CSS class encodes the host's gender
    host_gender = soup.select('div.member_pic > div')[0].get('class')[0]
    print(title, ' ', address, ' ', day_price, ' ', host_name, ' ', print_gender(host_gender))
完整代码如下:
from bs4 import BeautifulSoup
import requests
import time
# Shared list of detail-page URLs, filled by get_page_link() and consumed by main().
page = []
def print_gender(css_class):
    """Map the host avatar's CSS class to a gender label.

    'member_ico1' marks a female host; every other class is treated as male.
    (Renamed the parameter from `str`, which shadowed the builtin.)

    :param css_class: first CSS class of the avatar div
    :return: '女' or '男'
    """
    if css_class == 'member_ico1':
        return '女'
    return '男'
def get_information(url):
    """Fetch one listing detail page and print a one-line summary.

    Prints title, address, daily price, host name and host gender
    (decoded from the avatar's CSS class via print_gender).

    :param url: absolute URL of a listing detail page
    """
    # timeout prevents the crawler from hanging forever on a stalled request
    wb_data = requests.get(url, timeout=10)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    title = soup.select('div > div.con_l > div.pho_info > h4')[0].get_text()
    address = soup.select('div > div.con_l > div.pho_info > p > span')[0].get_text()
    day_price = soup.select('div.day_l > span')[0].get_text()
    host_name = soup.select('div > div.w_240 > h6 > a')[0].get_text()
    # the avatar div's first CSS class encodes the host's gender
    host_gender = soup.select('div.member_pic > div')[0].get('class')[0]
    print(title, ' ', address, ' ', day_price, ' ', host_name, ' ', print_gender(host_gender))
def get_page_link(page_num):
    """Collect listing detail-page URLs into the module-level `page` list.

    Scans search-result pages 1 .. page_num-1 (note: range(1, page_num)
    excludes the upper bound) and appends each listing's `detailurl`
    attribute to `page`.

    :param page_num: exclusive upper bound of the result-page index
    """
    for i in range(1, page_num):
        # .format(i) converts to str automatically; wrapping in str() was redundant.
        url = 'http://bj.xiaozhu.com/%E5%9C%B0%E9%93%81%E4%B9%9D%E5%8F%B7%E7%BA%BF_lysys-duanzufang-p{}-20/?putkey=%E5%9C%B0%E9%93%81%E4%B9%9D%E5%8F%B7%E7%BA%BF'.format(i)
        # timeout prevents the crawler from hanging forever on a stalled request
        wb_data = requests.get(url, timeout=10)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        all_link = soup.select('ul > li > div.result_btm_con.lodgeunitname')
        for link in all_link:
            page.append(str(link.get('detailurl')))
        # visual separator between result pages
        print('------------------')
def main():
    """Crawl the result pages, then print details for every collected listing."""
    get_page_link(3)
    for detail_url in page:
        get_information(detail_url)
        time.sleep(2)  # throttle requests so we stay polite to the server


if __name__ == '__main__':
    main()