Python 批量下载 PDF

时间:2025-03-13 10:26:43
# coding: utf-8
"""Batch-download the 2022 MCM contest certificate PDFs.

Run as a script: it creates (if needed) a ``pdf_download`` directory under
the current working directory, changes into it, and downloads one PDF per
control number in a fixed range.
"""
import os
import re
import shutil
import urllib.error
import urllib.request

# Part of the download address that is identical for every certificate.
# NOTE(review): this value has no scheme or host, so urllib.request.urlopen()
# raises ValueError ("unknown url type") for every URL built from it.  The
# site prefix was presumably lost when this snippet was copied -- restore the
# full "http(s)://<host>" prefix before running; confirm the correct host.
root_url = '/mcm/2022Certs/'


def getHtml(url):
    """Open *url* and return the raw response body as ``bytes``.

    The response is closed even if reading fails.
    """
    with urllib.request.urlopen(url) as page:
        return page.read()


def getFile(url):
    """Download *url* into the current working directory.

    The local file name is the last ``/``-separated component of *url*.
    When the server answers with an HTTP error (for example a control number
    that matches the pattern but has no certificate), a message is printed
    and the function returns without creating a file.

    Returns:
        None.

    Raises:
        ValueError: if *url* is not a well-formed absolute URL.
        urllib.error.URLError: on non-HTTP failures such as an unreachable
            host (only ``HTTPError`` is handled here).
    """
    file_name = url.split('/')[-1]
    try:
        response = urllib.request.urlopen(url)
    except urllib.error.HTTPError:
        # The URL matched the naming pattern but the file does not exist:
        # report it and move on to the next one.
        print(url, "url file not found")
        return

    block_sz = 8192
    # Close the response as well as the output file; the original left the
    # network connection open after every download.
    with response, open(file_name, 'wb') as f:
        shutil.copyfileobj(response, f, block_sz)
    print("Sucessful to download" + " " + file_name)


def main():
    """Create the download directory and fetch every certificate in range."""
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('pdf_download', exist_ok=True)
    os.chdir(os.path.join(os.getcwd(), 'pdf_download'))

    # Arbitrary range of control numbers; a list of your own institution's
    # team numbers can be used instead.
    for control_number in range(2202950, 2202955):
        # Build the complete download address.
        getFile(root_url + str(control_number) + '.pdf')


if __name__ == '__main__':
    main()