My first web scraper, for the record

Posted: 2022-05-27 06:13:15

The target is the short-term rental listing URL below:

http://bj.xiaozhu.com/%E5%9C%B0%E9%93%81%E4%B9%9D%E5%8F%B7%E7%BA%BF_lysys-duanzufang-p2-20/?putkey=%E5%9C%B0%E9%93%81%E4%B9%9D%E5%8F%B7%E7%BA%BF
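
(The percent-encoded blob in that URL is just the UTF-8 search keyword 地铁九号线, "Subway Line 9", and the p segment is the result-page number, as the page loop below confirms. You can verify the encoding with the standard library:

from urllib.parse import unquote

print(unquote('%E5%9C%B0%E9%93%81%E4%B9%9D%E5%8F%B7%E7%BA%BF'))  # -> 地铁九号线
)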


First, collect all the listing links from the two result pages:

import requests
from bs4 import BeautifulSoup

page = []


def get_page_link(page_num):
    for i in range(1, page_num):  # pages 1 through page_num - 1
        url = 'http://bj.xiaozhu.com/%E5%9C%B0%E9%93%81%E4%B9%9D%E5%8F%B7%E7%BA%BF_lysys-duanzufang-p{}-20/?putkey=%E5%9C%B0%E9%93%81%E4%B9%9D%E5%8F%B7%E7%BA%BF'.format(i)
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        all_link = soup.select('ul > li > div.result_btm_con.lodgeunitname')
        for link in all_link:
            # each result div stores the listing URL in its detailurl attribute
            page.append(str(link.get('detailurl')))
        print('------------------')


get_page_link(3)
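
Sites like this often reject requests that lack a browser-like User-Agent, so a slightly hardened variant can help. This is only a sketch of my own: the HEADERS value, the 10-second timeout, and the name get_page_links_safe are assumptions, not part of the original script.

import requests
from bs4 import BeautifulSoup

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # assumed browser-like UA; any real browser string works


def get_page_links_safe(page_num):
    # same scraping logic, but returns the links and fails loudly on HTTP errors
    links = []
    for i in range(1, page_num):
        url = ('http://bj.xiaozhu.com/%E5%9C%B0%E9%93%81%E4%B9%9D%E5%8F%B7%E7%BA%BF'
               '_lysys-duanzufang-p{}-20/'
               '?putkey=%E5%9C%B0%E9%93%81%E4%B9%9D%E5%8F%B7%E7%BA%BF'.format(i))
        resp = requests.get(url, headers=HEADERS, timeout=10)
        resp.raise_for_status()  # raise on 4xx/5xx instead of parsing an error page
        soup = BeautifulSoup(resp.text, 'lxml')
        for div in soup.select('ul > li > div.result_btm_con.lodgeunitname'):
            links.append(str(div.get('detailurl')))
    return links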


Then, for each link, fetch the details of the listing:
def get_information(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    title = soup.select('div > div.con_l > div.pho_info > h4')[0].get_text()
    address = soup.select('div > div.con_l > div.pho_info > p > span')[0].get_text()
    day_price = soup.select('div.day_l > span')[0].get_text()
    imag = soup.select('#curBigImage')[0].get('src')  # main photo URL (fetched but not printed)
    host_name = soup.select('div > div.w_240 > h6 > a')[0].get_text()
    host_gender = soup.select('div.member_pic > div')[0].get('class')[0]

    # print_gender maps the avatar's CSS class to 女/男; it is defined in the full listing below
    print(title, ' ', address, ' ', day_price, ' ', host_name, ' ', print_gender(host_gender))
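
One caveat: soup.select() returns an empty list when a selector matches nothing, so every [0] above raises IndexError on any page with a different layout. A small defensive helper of my own (first_text is not in the original script) that falls back to a placeholder instead:

def first_text(soup, selector, default='N/A'):
    # select_one returns None when the selector matches nothing,
    # so missing fields become the default instead of crashing
    node = soup.select_one(selector)
    return node.get_text(strip=True) if node else default

With it, a line like title = first_text(soup, 'div > div.con_l > div.pho_info > h4') survives layout changes.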


The full code:


from bs4 import BeautifulSoup
import requests
import time


page = []


def print_gender(class_name):
    # xiaozhu marks the host's gender with a CSS class on the avatar div
    if class_name == 'member_ico1':
        return '女'  # female
    else:
        return '男'  # male


def get_information(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    title = soup.select('div > div.con_l > div.pho_info > h4')[0].get_text()
    address = soup.select('div > div.con_l > div.pho_info > p > span')[0].get_text()
    day_price = soup.select('div.day_l > span')[0].get_text()
    imag = soup.select('#curBigImage')[0].get('src')  # main photo URL (fetched but not printed)
    host_name = soup.select('div > div.w_240 > h6 > a')[0].get_text()
    host_gender = soup.select('div.member_pic > div')[0].get('class')[0]

    print(title, ' ', address, ' ', day_price, ' ', host_name, ' ', print_gender(host_gender))


def get_page_link(page_num):
    for i in range(1, page_num):
        url = 'http://bj.xiaozhu.com/%E5%9C%B0%E9%93%81%E4%B9%9D%E5%8F%B7%E7%BA%BF_lysys-duanzufang-p{}-20/?putkey=%E5%9C%B0%E9%93%81%E4%B9%9D%E5%8F%B7%E7%BA%BF'.format(i)
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        all_link = soup.select('ul > li > div.result_btm_con.lodgeunitname')
        for link in all_link:
            # each result div stores the listing URL in its detailurl attribute
            page.append(str(link.get('detailurl')))
        print('------------------')


def main():
    get_page_link(3)
    for link in page:
        get_information(link)
        time.sleep(2)  # be polite: pause between requests


if __name__ == '__main__':
    main()


Running it prints one line per listing: title, address, nightly price, host name, and host gender.



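A natural next step would be saving the results instead of only printing them. Here is a minimal CSV sketch, assuming get_information were changed to return its fields as a tuple; the save_rows name, the listings.csv filename, and the column order are my own choices, not part of the original script.

import csv


def save_rows(rows, path='listings.csv'):
    # utf-8-sig so spreadsheet apps open the Chinese text correctly
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['title', 'address', 'day_price', 'host_name', 'gender'])
        writer.writerows(rows)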