Python Crawler Example: Scraping a Novel from 飞卢 (Faloo)

Posted: 2025-03-21 13:42:04
# Import the required libraries
import requests
from bs4 import BeautifulSoup
# Browser User-Agent header so the requests look like a normal browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.50"
}

# Fetch the text of a single chapter
def get_fiction(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    page = soup.find('div', class_='noveContent').text  # the chapter body sits in this div
    return page
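
# The function above assumes every request succeeds and that the 'noveContent'
# div is always present. A small defensive variant is sketched below; the name
# get_fiction_safe, the timeout and the retry count are illustrative choices,
# not part of the original script.
import time

def get_fiction_safe(url, retries=3):
    for attempt in range(retries):
        r = requests.get(url, headers=headers, timeout=10)
        if r.status_code == 200:
            div = BeautifulSoup(r.text, 'html.parser').find('div', class_='noveContent')
            if div is not None:          # guard against pages with a different layout
                return div.text
        time.sleep(2)                    # back off briefly before retrying
    return ''                            # give up on this chapter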


# Collect the chapter links, chapter titles and the novel's name from the TOC page
def get_links(index_url):
    links = []
    titles = []
    r = requests.get(url=index_url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    page = soup.find_all('div', class_='c_con_li_detail_p')
    for a in page:
        link = a.a['href']             # the href of the <a> tag inside each chapter div
        title = a.a['title']           # the chapter title stored in the title attribute
        fic_url = "https:" + link      # the hrefs are protocol-relative, so prepend the scheme
        links.append(fic_url)
        titles.append(title)
    filename = soup.find('div', class_='c_con_rl_title').text   # the novel's name
    return links, titles, filename
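
# For reference, urllib.parse.urljoin builds the full chapter URL for both
# protocol-relative ("//host/path") and plain relative ("/path") hrefs, so it
# is a more general alternative to the "https:" + link concatenation above.
# A sketch, assuming index_url is a full https:// URL; build_chapter_url is an
# illustrative name, not part of the original script.
from urllib.parse import urljoin

def build_chapter_url(index_url, href):
    return urljoin(index_url, href)    # handles "//host/path" and "/path" alike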

# Write the novel name, chapter title, chapter content and a separator to disk, in chapter order
def download(fic_url, title, filename, path):
    content = get_fiction(fic_url)  # reuse the step-one function on the chapter URL passed in
    with open(f"{path}/{filename}.txt", 'a', encoding='utf-8') as f:
        f.write(title + "\n\n")
        f.write(content + "\n\n")
        f.write('--' * 40 + "--")
        print(f'Downloaded {title}')
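
# download() appends every chapter to one big .txt file. If one file per
# chapter is preferred, the sketch below numbers the files so the chapter
# order is kept; download_chapter, the folder layout and the file-name
# sanitising are illustrative choices, not part of the original script.
import os

def download_chapter(fic_url, title, filename, path, index):
    content = get_fiction(fic_url)
    safe_title = "".join(c for c in title if c not in r'\/:*?"<>|')   # strip characters Windows forbids in file names
    os.makedirs(f"{path}/{filename}", exist_ok=True)                  # one folder per novel
    with open(f"{path}/{filename}/{index:04d}_{safe_title}.txt", 'w', encoding='utf-8') as f:
        f.write(content)
    print(f'Downloaded {title}')
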
if __name__ == '__main__':  # Step three: set up the values the functions need
    path = r'd:/小说'                      # storage path (create this folder first)
    index_url = '/html_1217_1217177/'      # the novel's TOC page (prepend the site's domain to make it a full URL)
    links, titles, filename = get_links(index_url)   # unpack what the TOC page returns
    for fic_url, title in zip(links, titles):        # zip the links and titles so both lists are walked together
        download(fic_url, title, filename, path)
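
# If the site throttles rapid requests, a short pause between chapters keeps
# the crawl polite. A sketch of the same loop with a delay; the name
# download_all_slowly and the one-second pause are illustrative, not part of
# the original script.
import time

def download_all_slowly(links, titles, filename, path, delay=1):
    for fic_url, title in zip(links, titles):      # same iteration as in __main__
        download(fic_url, title, filename, path)
        time.sleep(delay)                          # pause between chapter requests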

# Scraping an unencrypted novel:

# 1: Extract the novel's name, the chapter order, the chapter titles and the chapter content

# 2: Download the novel and save the text with a sensible naming scheme

# Once the libraries are installed, create a 小说 folder on the D drive (or let the script create it, see the note below) and the whole page can be copied, pasted and run as-is
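
# Note: the folder can also be created from the script itself rather than by
# hand; a minimal sketch (standard library only) is to place the two lines
# below just before the download loop in __main__:
#
#     import os
#     os.makedirs(path, exist_ok=True)   # create d:/小说 if it does not already exist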

Note: this article is for learning and exchange only; any commercial or illegal use is strictly prohibited. In case of infringement, contact the author to have it removed.