# Python下载PDF (Download PDF files with Python)
from bs4 import BeautifulSoup
import requests
import io
from urllib import parse
import time
# "/2021gongshi/6jiaoxue33/教授(12)/赵建国 职称评审简表.pdf"
def get_file_url(url_website):
data = requests.get(url_website)
data.encoding = 'gbk'
data.encoding = 'utf-8'
print(data.text)
soup = BeautifulSoup(data.text, '') # 文档对象
pdfURLs = []
# 查找a标签,只会查找出一个a标签
for k in soup.find_all('a'):
# 此步骤是要正确拼接URL,将汉字转换成百分号形式,并按照"//"进行split,生成列表
href = 'http://'+parse.quote