Scraping Rental Listings for Each Shenzhen District from Lianjia (Multithreaded Crawl)

Date: 2024-02-16 18:06:36

Code:

import requests
from pyquery import PyQuery as pq
import csv
from threading import Thread, Lock
import os

def get_one_url(url):
    """Fetch one listing page and return its HTML, or None on failure."""
    try:
        headers = {
            'Referer': 'https://s1.ljcdn.com/matrix_pc/dist/pc/src/common/css/common.css?_v=20191213202326259',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
        }
        # Send the request with browser-like headers
        response = requests.get(url, headers=headers)
        # Only return the page when the request succeeded
        if response.status_code == 200:
            return response.text
        return None
    except requests.exceptions.ConnectionError as e:
        print('error', e.args)
        return None

def parse_one_url(html):
    """Parse one listing page and yield a dict of fields for each rental item."""
    doc = pq(html)
    items = doc('.content__list--item--main').items()
    for item in items:
        # Acquire the mutex so each thread finishes building its record before another proceeds
        mutex.acquire()
        parts = item.find('a').text().split()
        info = {
            'content': parts[0],
            'room': parts[1],
            'direction': parts[2],
            'size': item.find('p.content__list--item--des').text().split('/')[1],
            'price': item.find('.content__list--item-price').text(),
            'location': item.find('.content__list--item--des a').text().replace(' ', '-')
        }
        # Release the mutex
        mutex.release()
        print(info)
        # Generator: yield each record so the caller can iterate over the results lazily
        yield info

# Create a mutex shared by the parser threads
mutex = Lock()

def man(page, city):
    """Crawl one page of listings for the given district and save the results."""
    url = 'https://sz.lianjia.com/zufang/%s/pg%d/#contentList' % (city, page)
    html = get_one_url(url)
    if html:
        infos = parse_one_url(html)
        save(infos, city)


def save(infos, city):
    '''
    Append the records to <city>.csv; write the header row only when the file does not exist yet.
    :param infos: records to save
    :param city: district abbreviation, also used as the file name
    :return:
    '''
    file_exists = os.path.exists('%s.csv' % city)
    with open('%s.csv' % city, 'a', encoding='utf-8', newline='') as csvfile:
        fieldnames = ['content', 'room', 'direction', 'size', 'location', 'price']
        writer = csv.DictWriter(csvfile, fieldnames)  # DictWriter writes dicts straight into the CSV
        if not file_exists:
            writer.writeheader()
        for info in infos:
            writer.writerow(info)


if __name__ == '__main__':
    cities = ['luohuqu', 'longhuaqu', 'futianqu']
    # Loop over the Shenzhen district abbreviations
    for city in cities:
        ts = []
        # Create one thread per page so the pages are crawled concurrently
        for i in range(1, 50):
            t = Thread(target=man, args=(i, city))
            ts.append(t)
        for t in ts:
            t.start()
        # Wait for every thread of this district to finish
        for t in ts:
            t.join()
        print('%s rental listings saved' % city)
    print('All districts saved')

Possible improvements: make the crawl interactive, so the user enters the Shenzhen districts to scrape and the program fetches and saves them automatically; use multiprocessing to crawl different districts at the same time and save each to its own CSV file; and, to reduce the risk of an IP ban, connect to a proxy API and route the requests through rotating proxies.
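As a minimal sketch of the multiprocessing and user-input ideas (assuming the code above is saved as lianjia_spider.py; the module name, the crawl_city helper, and the input prompt are illustrative assumptions, not part of the original script), each district could run in its own process while the per-page threads stay unchanged:

from multiprocessing import Process
from threading import Thread

from lianjia_spider import man  # reuse the per-page crawler defined above

def crawl_city(city, pages=49):
    # Inside one process, crawl every page of a single district with threads, as in the main script
    threads = [Thread(target=man, args=(page, city)) for page in range(1, pages + 1)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

if __name__ == '__main__':
    # Let the user choose districts, e.g. "luohuqu,longhuaqu,futianqu"
    cities = input('Districts to crawl (comma separated): ').split(',')
    processes = [Process(target=crawl_city, args=(c.strip(),)) for c in cities if c.strip()]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
    print('All districts saved')

For the proxy idea, requests.get already accepts a proxies argument, e.g. proxies={'http': 'http://1.2.3.4:8080', 'https': 'http://1.2.3.4:8080'}, so get_one_url would only need to fill that dict from whatever proxy API is used (the address here is a placeholder).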