方法一:使用面向过程爬取1.0
"""Douban reading-list scraper, v1.0: procedural, single page, one-shot dump."""
import json

import requests
from lxml import etree

URL = "https://www.douban.com/doulist/1264675/?start=0"
# Douban rejects requests without a browser-like User-Agent; send the same
# header the 2.0 and OO versions of this script already use.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/74.0.3729.108 Safari/537.36',
}

response = requests.get(URL, headers=HEADERS).content.decode()
dom = etree.HTML(response)
books = dom.xpath('//div[@class="article"]/div[@class="doulist-item"]')

# Collect every book entry; normalize-space() strips the surrounding
# whitespace the Douban markup embeds in each text node.
items = []
for book in books:
    items.append({
        'title': book.xpath('normalize-space(.//div/div[2]/div[3]/a/text())'),
        'author': book.xpath('normalize-space(.//div/div[2]/div[5]/text()[1])'),
        'year': book.xpath('normalize-space(.//div/div[2]/div[5]/text()[3])'),
        'rank': book.xpath('normalize-space(.//div/div[2]/div[4]/span[@class="rating_nums"]/text())'),
    })

# BUG FIX: the original wrote '<object>,\n' per book, which is not valid
# JSON (trailing commas, no enclosing array). Dump one well-formed array.
with open('book.json', 'w', encoding='utf-8') as f:
    json.dump(items, f, ensure_ascii=False, indent=2)
方法一:使用面向过程爬取2.0(使用函数封装爬取)
"""Douban reading-list scraper, v2.0: procedural, one function per step."""
import json
import time

import requests
from lxml import etree
from requests.exceptions import RequestException


def get_one_page(url):
    """Fetch *url* and return its HTML text, or None on any request failure
    (non-200 status or a requests-level exception)."""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/74.0.3729.108 Safari/537.36',
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """Yield one dict (title/author/year/rank) per book found in *html*.

    Iterates the nodes actually matched instead of the original hard-coded
    range(25), which raised IndexError whenever a page held fewer entries.
    """
    dom = etree.HTML(html)
    nodes = dom.xpath('//div[@class="article"]/div[@class="doulist-item"]')
    for node in nodes:
        yield {
            'title': node.xpath('normalize-space(.//div/div[2]/div[3]/a/text())'),
            'author': node.xpath('normalize-space(.//div/div[2]/div[5]/text()[1])'),
            'year': node.xpath('normalize-space(.//div/div[2]/div[5]/text()[3])'),
            'rank': node.xpath('normalize-space(.//div/div[2]/div[4]/span[@class="rating_nums"]/text())'),
        }


def write_to_file(content):
    """Append *content* to result.json as one JSON-Lines record."""
    with open('result.json', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(start):
    """Scrape one 25-item page of the list, starting at offset *start*."""
    # BUG FIX: the original concatenated start onto '?start=0', producing
    # start=00, start=025, ... so every request re-fetched page one.
    url = 'https://www.douban.com/doulist/1264675/?start=' + str(start)
    html = get_one_page(url)
    if html is None:  # request failed; nothing to parse
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    for i in range(0, 250, 25):
        main(start=i)
        time.sleep(1)  # throttle so Douban doesn't rate-limit us
方法二:使用面向对象爬取
"""Douban reading-list scraper, OO version."""
import json
import time

import requests
from lxml import etree
from requests.exceptions import RequestException


class DouBanReadBook():
    """Scrape the Douban reading ranking list (doulist 1264675)."""

    def __init__(self):
        # Base list URL and a browser-like User-Agent (Douban rejects
        # requests without one).
        self.url = 'https://www.douban.com/doulist/1264675/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/74.0.3729.108 Safari/537.36',
        }

    def get_one_page(self, url):
        """Fetch *url* and return its HTML text, or None on failure
        (non-200 status or a requests-level exception)."""
        try:
            response = requests.get(url, headers=self.headers)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            return None

    def process_data(self, html):
        """Yield one dict (title/author/year/rank) per book found in *html*.

        Iterates the matched nodes directly; the original hard-coded
        range(25) raised IndexError on pages with fewer entries.
        """
        dom = etree.HTML(html)
        nodes = dom.xpath('//div[@class="article"]/div[@class="doulist-item"]')
        for node in nodes:
            yield {
                'title': node.xpath('normalize-space(.//div/div[2]/div[3]/a/text())'),
                'author': node.xpath('normalize-space(.//div/div[2]/div[5]/text()[1])'),
                'year': node.xpath('normalize-space(.//div/div[2]/div[5]/text()[3])'),
                'rank': node.xpath('normalize-space(.//div/div[2]/div[4]/span[@class="rating_nums"]/text())'),
            }

    def save_file(self, content):
        """Append *content* to result.json as one JSON-Lines record."""
        with open('result.json', 'a', encoding='utf-8') as f:
            f.write(json.dumps(content, ensure_ascii=False) + '\n')

    def main(self, start):
        """Scrape one 25-item page of the list, starting at offset *start*."""
        # BUG FIX: the original concatenated start onto '?start=0',
        # producing start=00, start=025, ... so every request re-fetched
        # page one of the list.
        url = self.url + '?start=' + str(start)
        html = self.get_one_page(url)
        if html is None:  # request failed; nothing to parse
            return
        for item in self.process_data(html):
            print(item)
            self.save_file(item)


if __name__ == '__main__':
    read_book = DouBanReadBook()
    for i in range(0, 250, 25):
        read_book.main(start=i)
        time.sleep(1)  # throttle so Douban doesn't rate-limit us