Python下载PDF

时间:2024-10-12 07:18:54
from bs4 import BeautifulSoup import requests import io from urllib import parse import time # "/2021gongshi/6jiaoxue33/教授(12)/赵建国 职称评审简表.pdf" def get_file_url(url_website): data = requests.get(url_website) data.encoding = 'gbk' data.encoding = 'utf-8' print(data.text) soup = BeautifulSoup(data.text, '') # 文档对象 pdfURLs = [] # 查找a标签,只会查找出一个a标签 for k in soup.find_all('a'): # 此步骤是要正确拼接URL,将汉字转换成百分号形式,并按照"//"进行split,生成列表 href = 'http://'+parse.quote