Python:使用 requests 和 Selenium 抓取网页文本并下载音频
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Download audio and scrape transcripts from the Duolingo Spanish podcast.

The script walks the listing pages under
https://podcast.duolingo.com/spanish, remembers every page and episode URL
in a ``shelve`` database named ``data``, writes one transcript text file per
episode into ``transcript/`` and downloads each episode's audio into
``audio/`` by driving Chrome through Selenium.

Requires the third-party packages ``requests`` and ``selenium`` and a
working Chrome/chromedriver installation for ``webdriver.Chrome()``;
without them the script fails.
"""

import os
import re
import shelve
import time

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By

# Output directories, relative to the current working directory.
TRANSCRIPT_DIR = "transcript"
AUDIO_DIR = "audio"

# Seconds between two checks for unfinished Chrome downloads.
DOWNLOAD_POLL_SECONDS = 5

# Raw strings so ``\s`` is a regex escape, and lazy groups so that several
# matches on one physical line are not merged into a single capture.
EPISODE_LINK_RE = re.compile(r'entry-title">\s*<a href="(.*?)" rel')
TRANSCRIPT_LINE_RE = re.compile(r'strong>(.*?)</strong>(.*?)</p>')
AUDIO_IFRAME_RE = re.compile(r'<iframe .*? src="(.*?)" height')


def mainProc():
    """Run the whole job: open the database, crawl pages, process episodes."""
    db = openDb()
    try:
        get_pages(db)
        get_episodes(db)
    finally:
        # Close (and thereby flush) the shelf even if a crawl step raised.
        db.close()


def openDb():
    """Open the ``data`` shelf, creating the expected keys when missing.

    Returns:
        A ``shelve`` object opened with ``writeback=True`` that is guaranteed
        to contain the list-valued keys ``"pages"`` and ``"episodes"``.
    """
    db = shelve.open("data", writeback=True)
    # The on-disk file name depends on the dbm backend ("data", "data.db",
    # "data.dat", ...), so test for the keys instead of for a file name.
    db.setdefault("pages", [])
    db.setdefault("episodes", [])
    return db


def get_pages(db):
    """Collect listing pages and the episode URLs they link to.

    Pages are ``.../spanish``, ``.../spanish2``, ``.../spanish3`` and so on.
    Crawling stops at the first page that does not answer with HTTP 200.

    NOTE: a page already stored in ``db["pages"]`` is not fetched again, so
    episodes added to it after it was recorded are not picked up.

    Args:
        db: The shelf returned by ``openDb``; ``db["pages"]`` and
            ``db["episodes"]`` are extended in place.
    """
    main = "https://podcast.duolingo.com/spanish"
    # Site root, i.e. ``main`` without the trailing "spanish".
    site_root = main[: -len("spanish")]

    for i in range(1, 100):
        # The first page is the main page itself, without a numeric suffix.
        page = main if i == 1 else main + str(i)
        if page in db["pages"]:
            continue

        r = requests.get(page)
        print(f"{page} with status code {r.status_code}.")
        if r.status_code != 200:
            break
        db["pages"].append(page)

        for href in EPISODE_LINK_RE.findall(r.text):
            # Drops the first two characters of the link before joining it
            # to the site root (presumably a leading "./" -- TODO confirm).
            episode = site_root + href[2:]
            if episode not in db["episodes"]:
                db["episodes"].append(episode)

        # Persist after every page so an interrupted run keeps its progress.
        db.sync()


def get_episodes(db):
    """Fetch every stored episode page, then save its transcript and audio.

    Episodes whose page does not answer with HTTP 200 are skipped.

    Args:
        db: The shelf returned by ``openDb``; ``db["episodes"]`` is read.
    """
    for episode in db["episodes"]:
        r = requests.get(episode)
        print(f"{episode} with status code {r.status_code}.")
        if r.status_code != 200:
            continue
        # Hand over the HTML that was just fetched to avoid a second request.
        get_transcript(episode, r.text)
        get_audios(r, episode)


def get_transcript(episode, html=None):
    """Write the transcript of one episode to ``transcript/<slug>.txt``.

    Nothing is written when the target file already exists.

    Args:
        episode: URL of the episode page; its last path segment becomes the
            file name.
        html: Optional HTML of the episode page. When omitted the page is
            downloaded here.
    """
    # rstrip guards against a trailing slash producing an empty file name.
    slug = episode.rstrip("/").split("/")[-1]
    filename = TRANSCRIPT_DIR + "/" + slug + ".txt"
    if os.path.exists(filename):
        print(filename, "existed!")
        return

    if html is None:
        req = requests.get(episode)
        print("{episode} with status code {status}.".format(
            episode=episode, status=req.status_code))
        if req.status_code != 200:
            # Do not create a file for a failed fetch: it would make every
            # later run report the transcript as already existing.
            return
        html = req.text

    os.makedirs(TRANSCRIPT_DIR, exist_ok=True)
    with open(filename, "w", encoding="utf-8") as fp:
        # Each match is a (text inside <strong>, rest of the paragraph)
        # pair; the parts are written back to back, then a blank line.
        for parts in TRANSCRIPT_LINE_RE.findall(html):
            fp.write("".join(parts))
            fp.write("\n\n")
    print(filename, "added!")


def _wait_for_downloads(directory, poll_seconds=DOWNLOAD_POLL_SECONDS):
    """Block until ``directory`` holds no ``.crdownload`` partial files."""
    while True:
        # Sleep before the first check so the download has time to start.
        time.sleep(poll_seconds)
        pending = [
            name for name in os.listdir(directory)
            if name.endswith(".crdownload")
        ]
        if not pending:
            return


def get_audios(r, episode):
    """Download the audio of one episode into ``audio/`` using Chrome.

    The audio player URL is read from the ``src`` of the iframe in the
    episode page; the player's ``download-player`` element is then clicked
    and the function waits until Chrome has finished downloading.

    Args:
        r: Response of the episode page; only ``r.text`` is read.
        episode: URL of the episode page, used for the skip message only.
    """
    found = AUDIO_IFRAME_RE.search(r.text)
    if found is None:
        print(f"{episode} has no audio iframe, skipped.")
        return
    audio = "https:" + found.group(1)

    # Chrome downloads into the same directory that is polled below, and
    # that directory must exist before the download starts.
    download_dir = os.path.abspath(AUDIO_DIR)
    os.makedirs(download_dir, exist_ok=True)

    # Custom download configuration.
    chromeOptions = webdriver.ChromeOptions()
    chromeOptions.add_argument("--ignore-certificate-errors")
    prefs = {"download.default_directory": download_dir}
    chromeOptions.add_experimental_option("prefs", prefs)

    print(audio)
    browser = webdriver.Chrome(options=chromeOptions)
    try:
        browser.get(audio)
        browser.find_element(By.ID, "download-player").click()
        _wait_for_downloads(download_dir)
    finally:
        # quit() ends the whole driver session, also when a step above fails.
        browser.quit()


if __name__ == "__main__":
    mainProc()