【菜鸟学Python】使用Xpath爬取豆瓣读书

时间:2024-02-23 19:42:13
方法一:使用面向过程爬取1.0

import json
import requests
from lxml import etree

# Fetch the first page of the Douban reading list and dump parsed items to book.json.
url = "https://www.douban.com/doulist/1264675/?start=0"
# timeout prevents the script from hanging forever on a stalled connection
response = requests.get(url, timeout=10).content.decode()
dom = etree.HTML(response)
# One <div class="doulist-item"> per book entry on the page.
books = dom.xpath('//div[@class="article"]/div[@class="doulist-item"]')
with open('book.json', 'w', encoding='utf-8') as f:
    for book in books:
        # normalize-space() collapses the surrounding whitespace lxml leaves in text nodes
        item = {
            'title': book.xpath('normalize-space(.//div/div[2]/div[3]/a/text())'),
            'author': book.xpath('normalize-space(.//div/div[2]/div[5]/text()[1])'),
            'year': book.xpath('normalize-space(.//div/div[2]/div[5]/text()[3])'),
            'rank': book.xpath('normalize-space(.//div/div[2]/div[4]/span[@class="rating_nums"]/text())'),
        }
        # One JSON object per line, comma-terminated (original output format kept).
        f.write(json.dumps(item, ensure_ascii=False) + ',\n')
方法一:使用面向过程爬取2.0(使用函数封装爬取)
import time
import json
import requests
from lxml import etree
from requests.exceptions import RequestException


def get_one_page(url):
    """Download one list page.

    Returns the HTML text on HTTP 200, or None on any other status
    or on a requests-level failure (connection error, timeout, ...).
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'}
        # timeout keeps the crawler from blocking indefinitely on a dead host
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """Parse one list page and yield a dict per book (title/author/year/rank).

    html may be None (get_one_page returns None on failure); in that case
    nothing is yielded instead of crashing in etree.HTML.
    """
    if html is None:
        return
    dom = etree.HTML(html)
    nodes = dom.xpath('//div[@class="article"]/div[@class="doulist-item"]')
    # Iterate over the nodes actually found instead of assuming exactly 25:
    # the original `for i in range(25)` raised IndexError on short pages.
    for book_value in nodes:
        yield {
            'title': book_value.xpath('normalize-space(.//div/div[2]/div[3]/a/text())'),
            'author': book_value.xpath('normalize-space(.//div/div[2]/div[5]/text()[1])'),
            'year': book_value.xpath('normalize-space(.//div/div[2]/div[5]/text()[3])'),
            'rank': book_value.xpath('normalize-space(.//div/div[2]/div[4]/span[@class="rating_nums"]/text())')
        }


def write_to_file(content):
    """Append one scraped item to result.json as a single JSON line.

    ensure_ascii=False keeps Chinese titles human-readable in the file.
    """
    with open('result.json', 'a', encoding='utf-8') as f:
        # Debug print of type(json.dumps(...)) removed - it was leftover noise.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(start):
    """Crawl one page of the list at the given offset and persist every item.

    start: pagination offset (0, 25, 50, ...) passed as the ?start= parameter.
    """
    # BUG FIX: the original concatenated onto '?start=0', producing
    # start=025, start=050, ... so every call refetched the first page.
    url = 'https://www.douban.com/doulist/1264675/?start=' + str(start)
    html = get_one_page(url)
    if html is None:
        return  # skip the page on fetch failure instead of crashing downstream
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    # Walk the list 25 items at a time (10 pages total), sleeping one
    # second between requests to stay polite to the server.
    for offset in range(0, 250, 25):
        main(start=offset)
        time.sleep(1)

方法二:使用面向对象爬取

import time
import json
import requests
from lxml import etree
from requests.exceptions import RequestException


class DouBanReadBook():
    """Crawler for the Douban reading list (doulist 1264675).

    Fetches pages, parses book entries with XPath, and appends each item
    to result.json as one JSON line.
    """

    def __init__(self):
        # Base list URL; pagination uses the ?start= query parameter.
        self.url = 'https://www.douban.com/doulist/1264675/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
        }

    def get_one_page(self, url):
        """Fetch one page; return its HTML text, or None on any failure."""
        try:
            # timeout keeps the crawler from hanging forever on a dead host
            response = requests.get(url, headers=self.headers, timeout=10)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            return None

    def process_data(self, html):
        """Yield one dict (title/author/year/rank) per book found in html."""
        if html is None:
            return  # get_one_page returned None; yield nothing instead of crashing
        dom = etree.HTML(html)
        nodes = dom.xpath('//div[@class="article"]/div[@class="doulist-item"]')
        # Iterate over what was actually found: the original range(25) raised
        # IndexError whenever a page held fewer than 25 entries.
        for book_value in nodes:
            yield {
                'title': book_value.xpath('normalize-space(.//div/div[2]/div[3]/a/text())'),
                'author': book_value.xpath('normalize-space(.//div/div[2]/div[5]/text()[1])'),
                'year': book_value.xpath('normalize-space(.//div/div[2]/div[5]/text()[3])'),
                'rank': book_value.xpath('normalize-space(.//div/div[2]/div[4]/span[@class="rating_nums"]/text())')
            }

    def save_file(self, content):
        """Append one item to result.json as a single JSON line."""
        with open('result.json', 'a', encoding='utf-8') as f:
            # ensure_ascii=False keeps Chinese titles readable in the file.
            f.write(json.dumps(content, ensure_ascii=False) + '\n')

    def main(self, start):
        """Crawl one page at the given offset and persist every parsed item."""
        # BUG FIX: the original appended start onto '?start=0', producing
        # start=025, start=050, ... so every call refetched the first page.
        url = self.url + '?start=' + str(start)
        html = self.get_one_page(url)
        for item in self.process_data(html):
            print(item)
            self.save_file(item)


if __name__ == '__main__':
    # Drive the crawler over 10 pages (offsets 0, 25, ..., 225),
    # pausing one second between requests.
    crawler = DouBanReadBook()
    for offset in range(0, 250, 25):
        crawler.main(start=offset)
        time.sleep(1)