GB Standard Document Crawler and Downloader

Time: 2024-10-04 07:31:32
""" author:babyfengfjx """ import requests import re from time import sleep from bs4 import BeautifulSoup import shelve headers = { "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8", # "Accept-Encoding": "gzip, deflate, br, zstd", "Accept-Language": "zh-CN,zh;q=0.9", "Connection": "keep-alive", "Cookie": "HMACCOUNT_BFESS=D1D8258E03E0A558; BDUSS_BFESS=dKNzJKaFhQQWFvcEpSZG9oRE5YR0Zod1l-VHE3ZVFLfnJTZWNJT3JKbGdiT3BsRVFBQUFBJCQAAAAAAAAAAAEAAAB~qcIBZmxvd2ludGhlcml2ZXIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGDfwmVg38JlMT; H_WISE_SIDS_BFESS=40008_40206_40212_40215_40080_40364_40352_40367_40374_40401_40311_40301_40467_40461_40471_40456_40317; BAIDUID_BFESS=A6E2AF276F85EFFB50804B65078FB44D:FG=1; ZFY=hyR2bKIUFoz76hVFPIVRUUHYScV4SOFL0yQP0ASJu4k:C", # "Host": "", "Referer": "/", # "Sec-Ch-Ua": "\"Chromium\";v=\"124\", \"Microsoft Edge\";v=\"124\", \"\";v=\"99\"", # "Sec-Ch-Ua-Mobile": "?0", # "Sec-Ch-Ua-Platform": "\"Windows\"", # "Sec-Fetch-Dest": "image", # "Sec-Fetch-Mode": "no-cors", # "Sec-Fetch-Site": "cross-site", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0" } def getcontent_list(html): # /bz/index_927.html res = requests.get(html,headers=headers) res.encoding = 'GBK' html_content= res.text # print(html_content) repat = re.compile(r'<a href="(https.+?)".*?title="(.*?)">.*?</a>') repat1 = re.compile(r'<a href="(https.+?)".*?target="_blank">(.*?)</a>') result = repat.findall(html_content) result1 = repat1.findall(html_content) return result + result1 def downloadfiles(fileinfo): url = fileinfo[0] name = fileinfo[1] download_url_info = requests.get(url,headers=headers).text # print(download_url_info) repat = re.compile(r'<a href="(https.*?)" target="_blank" rel="nofollow" ' r'class="bz-down-button">在线预览</a>') download_url = repat.findall(download_url_info) name = name.replace('/','-').replace('∕','-').replace(':', '-').replace('*','-') # 这里因为/在windows下不能用作文件名,所以替换掉 return download_url[0],name def download_file(url,name): with shelve.open('download_list') as f: if url in f: # print(f'{name}》已经下载过') return print(f'{name}--开始下载》{url}') res = requests.get(url,headers=headers,stream=True) if res.status_code == 200: with open(f'{name}.pdf','wb') as f: for chunk in res.iter_content(chunk_size=16384): #16384就是16k f.write(chunk) print(f'{name}下载完成') with shelve.open('download_list') as f: f[url] = name else: print(f'{name}下载失败') if __name__ == '__main__': for base_page in range(45,928): htmlbase = f'/bz/index_{base_page}.html' print(f"当前访问页面:{htmlbase}") res = getcontent_list(htmlbase) # print(res) for i in res: # print(i) # if "软件" in i[1]: download_url,name= downloadfiles(i) download_file(download_url,name) sleep(1)
