How I scrape the latest housing price data

Posted: 2024-12-09 21:34:50
The whole crawl fits in one script: fetch the listing pages, pull each development's name and prices out of the HTML, clean them up, and write every record to a file.

import math
import re

import requests
from bs4 import BeautifulSoup

# Create the output file and get ready to write
with open("", "w", encoding='utf-8') as f:
    # Collect the new-home data we need
    total_page = 1
    loupan_list = list()
    page = '/loupan/'
    # Build the request headers
    headers = create_headers()
    # Request the URL and get the response
    response = requests.get(page, timeout=10, headers=headers)
    html = response.content
    # Parse the returned HTML
    soup = BeautifulSoup(html, "lxml")
    # Read the total page count
    try:
        page_box = soup.find_all('div', class_='page-box')[0]
        matches = re.search(r'.*data-total-count="(\d+)".*', str(page_box))
        total_page = int(math.ceil(int(matches.group(1)) / 10))
    except Exception as e:
        print(e)
    print('Total pages: ' + str(total_page))
    # Build the request headers
    headers = create_headers()
    # Walk the pages starting from the first one
    for i in range(1, total_page + 1):
        page = '/loupan/pg{0}'.format(i)
        print(page)
        response = requests.get(page, timeout=10, headers=headers)
        html = response.content
        # Parse the response
        soup = BeautifulSoup(html, "lxml")
        # Grab the listing blocks
        house_elements = soup.find_all('li', class_="resblock-list")
        # Loop over them and pick out the elements we want
        for house_elem in house_elements:
            price = house_elem.find('span', class_="number")
            desc = house_elem.find('span', class_="desc")
            total = house_elem.find('div', class_="second")
            loupan = house_elem.find('a', class_='name')

            # Start cleaning the data
            try:
                price = price.text.strip() + desc.text.strip()
            except Exception as e:
                price = '0'
            loupan = loupan.text.replace("\n", "")

            # Keep cleaning the data
            try:
                total = total.text.strip().replace(u'总价', '')
                total = total.replace(u'/套起', '')
            except Exception as e:
                total = '0'

            # Save the record as an object
            loupan = NewHouse(loupan, price, total)
            print(loupan.text())

            # Add it to the list of new homes
            loupan_list.append(loupan)

    # Loop over everything we collected and write it to the file
    for loupan in loupan_list:
        f.write(loupan.text() + "\n")
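The script leans on two helpers that never appear in the post: create_headers() and the NewHouse class. Below is a minimal sketch of what they could look like, assuming create_headers() only has to hand back a headers dict for requests.get() and that NewHouse just bundles the three cleaned fields behind the text() method the write loop calls; the user-agent strings and field names are my placeholders, not taken from the original.

import random

# Placeholder user-agent strings; substitute whatever browsers you want to mimic.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
]

def create_headers():
    # The crawl above only needs a dict it can pass to requests.get(headers=...),
    # so a rotating User-Agent is enough for this sketch.
    return {"User-Agent": random.choice(USER_AGENTS)}

class NewHouse(object):
    # Simple container for one development: name, unit price, total price.
    def __init__(self, xiaoqu, price, total_price):
        self.xiaoqu = xiaoqu
        self.price = price
        self.total_price = total_price

    def text(self):
        # One comma-separated line per record, matching how print() and f.write() use it above.
        return self.xiaoqu + "," + self.price + "," + self.total_price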