Python 使用selenium抓取网页文本和下载音频

时间:2024-02-29 16:15:18

Python 使用selenium抓取网页文本和下载音频

#!/usr/bin/env python
# -*- coding: utf-8 -*-

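"""Automatically download the audio and scrape the text from https://podcast.duolingo.com/spanish.

The libraries imported below must be installed and webdriver.Chrome() must be
configured, otherwise the program fails with an error.
"""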

import os
import re
import shelve
import time

import requests
from selenium import webdriver
                   
def mainProc():
    """Main procedure: collect the page and episode URLs, then process each episode.

    Opens the shelve store via openDb(), lets get_pages() and get_episodes()
    work on it, and always closes the store afterwards.
    """
    db = openDb()
    try:
        get_pages(db)
        get_episodes(db)
    finally:
        # Close even when a step raises: the store is opened with
        # writeback=True, so entries appended so far are only persisted
        # when the shelf is synced or closed.
        db.close()

def openDb():
    """Open the "data" shelve store in the current directory, initializing it if needed.

    Returns:
        The shelf, opened with writeback=True, guaranteed to contain the
        list-valued keys "pages" and "episodes".
    """
    db = shelve.open("data", writeback=True)
    # The file name(s) shelve creates depend on the dbm backend ("data",
    # "data.db", "data.dat", ...), so checking for "data.dat" on disk is not a
    # reliable "already initialized" test and could reset an existing store.
    # Only create the keys that are actually missing.
    db.setdefault("pages", [])
    db.setdefault("episodes", [])
    return db

def get_pages(db):
    """Walk the listing pages and record page and episode URLs in the store.

    The first listing page is the main URL itself; following pages are the
    main URL with the page number appended (".../spanish2", ".../spanish3",
    and so on). A page already present in db["pages"] is skipped without
    being requested. The walk stops at the first page that does not answer
    with status 200.

    Args:
        db: mapping with list values under "pages" and "episodes"; both
            lists are appended to in place.
    """
    # Main page
    main = 'https://podcast.duolingo.com/spanish'
    # Site root, i.e. the main URL without its trailing "spanish" segment.
    site_root = main[:-len('spanish')]
    # Non-greedy capture so that two links on one line are not merged.
    link_pattern = re.compile(r'entry-title">\s*<a href="(.*?)" rel')

    for i in range(1, 100):
        page = main if i == 1 else main + str(i)
        if page in db["pages"]:
            continue
        # A timeout keeps one stalled connection from hanging the whole run.
        r = requests.get(page, timeout=30)
        print(f'{page} with status code {r.status_code}.')
        if r.status_code != 200:
            break
        db["pages"].append(page)
        # Collect every episode link on the page and complete it to a full
        # URL. The first two characters of the href are dropped (presumably a
        # leading "./" -- confirm against the site's markup).
        for href in link_pattern.findall(r.text):
            db["episodes"].append(site_root + href[2:])
                        
def get_episodes(db):
    """Visit every stored episode URL and save its transcript and audio.

    Episodes that do not answer with status 200 are skipped.

    Args:
        db: mapping whose "episodes" entry is an iterable of episode URLs.
    """
    for episode in db["episodes"]:
        # A timeout keeps one stalled connection from hanging the whole run.
        r = requests.get(episode, timeout=30)
        print(f'{episode} with status code {r.status_code}.')
        if r.status_code != 200:
            continue
        # Write the page text to a file and download the audio
        get_transcript(episode)
        get_audios(r, episode)

def get_transcript(episode):
    """Save the transcript text of one episode page to transcript/<name>.txt.

    <name> is the last path segment of the episode URL. If the file already
    exists nothing is fetched. Otherwise the page is requested and, for every
    regex match of a <strong>...</strong> part followed by the rest of the
    paragraph, both captured parts are written followed by a blank line.

    Args:
        episode: URL of the episode page.
    """
    # rstrip guards against a trailing "/" producing an empty file name.
    filename = 'transcript/' + episode.rstrip('/').split('/')[-1] + '.txt'
    if os.path.exists(filename):
        print(filename, 'existed!')
        return

    req = requests.get(episode, timeout=30)
    print('{episode} with status code {status}.'.format(episode=episode, status=req.status_code))
    if req.status_code != 200:
        # Do not create a file for a failed request: it would hold no real
        # transcript, and the "existed!" check above would block any retry.
        return

    os.makedirs('transcript', exist_ok=True)
    with open(filename, 'w', encoding="utf-8") as fp:
        for lines in re.findall('strong>(.*)</strong>(.*)</p>', req.text):
            fp.write(''.join(lines))
            fp.write('\n\n')
    print(filename, 'added!')

def get_audios(r, episode):
    """Download the audio of one episode through a Chrome browser session.

    The audio player URL is taken from the first <iframe> src found in the
    episode page. The player page is opened in Chrome, its element with id
    "download-player" is clicked, and the function waits until the download
    into the local "audio" directory has finished.

    Args:
        r: response of the episode page; only its ``text`` attribute is read.
        episode: URL of the episode page (used for the skip message only).
    """
    sources = re.findall(r'<iframe .*? src="(.*?)" height', r.text)
    if not sources:
        # Without this guard a page lacking the iframe raises IndexError and
        # aborts the processing of all remaining episodes.
        print(f'{episode} has no audio iframe, skipped.')
        return
    audio = "https:" + sources[0]

    # Download into the same directory that is polled below; it has to exist
    # before the browser starts writing to it.
    download_dir = os.path.abspath("audio")
    os.makedirs(download_dir, exist_ok=True)

    # Custom download configuration
    chromeOptions = webdriver.ChromeOptions()
    chromeOptions.add_argument("--ignore-certificate-errors")
    prefs = {"download.default_directory": download_dir}
    chromeOptions.add_experimental_option("prefs", prefs)

    # Download the file
    print(audio)
    # "options=" replaces the "chrome_options=" keyword removed in Selenium 4.
    browser = webdriver.Chrome(options=chromeOptions)
    try:
        browser.get(audio)
        present_before = set(os.listdir(download_dir))
        # find_element("id", ...) replaces find_element_by_id, which was
        # removed in Selenium 4.
        browser.find_element("id", 'download-player').click()

        # Wait until a new file has appeared and no partial ".crdownload"
        # file is left. Checking only for ".crdownload" can pass before the
        # download has even started. Give up after ten minutes.
        deadline = time.monotonic() + 600
        finished = False
        while not finished and time.monotonic() < deadline:
            time.sleep(5)
            present_now = set(os.listdir(download_dir))
            in_progress = any(name.endswith(".crdownload") for name in present_now)
            finished = not in_progress and bool(present_now - present_before)
        if not finished:
            print(f'{audio} download did not finish in time.')
    finally:
        # quit() ends the driver process as well as the window, also on error.
        browser.quit()

# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    mainProc()