Python Requests下载开源网站的代码(带索引数据)

环境搭建

python 3.x
requests 包
re 包
gooey包（用于可视化）

代码

				?

									import requests

									import re

									import os

									from gooey import Gooey, GooeyParser

									import time

									# Shared HTTP session reused by every request made in this script.
									s = requests.Session()

									def judgeTypeOfPath(name):
									    """Tell whether a listing entry names a file or a directory.

									    An entry is treated as a directory when it ends with '/'; anything
									    else, including an empty string, is treated as a file.

									    :param name: entry name (href) taken from the directory listing
									    :return: True -> file; False -> directory
									    """
									    # endswith() is safe on an empty string, whereas name[-1] raises IndexError.
									    return not name.endswith('/')

									def makeDirOfPath(path):
									    """Create the directory *path*, including any missing parent directories.

									    :param path: directory path to create
									    :return: True -> the directory exists afterwards; False -> it could not be created
									    """
									    try:
									        # makedirs also creates missing parents and tolerates an existing directory.
									        os.makedirs(path, exist_ok=True)
									    except OSError:
									        # e.g. a regular file already occupies the path, or permission was denied.
									        return False
									    return os.path.isdir(path)

									def getPath(url):
									    """Fetch a directory-listing page and return the entries it links to.

									    :param url: URL of the listing page to read
									    :return: list of href values found in '<li><a href="...">' items,
									             without the first one
									    """
									    # NOTE(review): verify=False disables TLS certificate checking; it is kept
									    # from the original behavior -- confirm it is really needed for this server.
									    # The timeout keeps an unresponsive server from hanging the crawl forever.
									    page = s.get(url=url, verify=False, timeout=60).text
									    # Capture the href directly; extra attributes after it (such as
									    # rel="external nofollow") are accepted but not required.
									    hrefs = re.findall(r'<li><a\s+href="([^"]*)"[^>]*>', page)
									    # The first entry is dropped, as before -- presumably the parent-directory link.
									    return hrefs[1:]

									def rfSearch(listOfPath, url, nowPath):
									    """Recursively mirror a remote directory listing to local disk.

									    Entries ending in '/' are created as local directories and descended
									    into; every other entry is downloaded unless a local file of that name
									    already exists.

									    :param listOfPath: names of the files and directories in the current remote directory
									    :param url: URL of the current remote directory (joined by concatenation, so it must end with '/')
									    :param nowPath: local directory matching url (joined by concatenation, so it must end with a separator)
									    :return: None
									    """
									    for name in listOfPath:
									        if not name:
									            # An empty href names neither a file nor a directory; skip it.
									            continue
									        if not judgeTypeOfPath(name):
									            # Directory: create it locally, then recurse into its listing.
									            localDir = nowPath + name
									            remoteDir = url + name
									            makeDirOfPath(localDir)
									            rfSearch(getPath(remoteDir), remoteDir, localDir)
									        else:
									            print(f'开始下载{name}...')
									            t1 = time.time()
									            localFile = nowPath + name
									            if not os.path.exists(localFile):
									                # Download under a temporary name first so an interrupted
									                # transfer is not mistaken for a finished file on the next run.
									                partFile = localFile + '.part'
									                with s.get(url + name, stream=True, verify=False) as r:
									                    # Do not save an HTTP error page as if it were the data file.
									                    r.raise_for_status()
									                    with open(partFile, "wb") as f:
									                        for chunk in r.iter_content(chunk_size=10240):
									                            if chunk:
									                                f.write(chunk)
									                os.replace(partFile, localFile)
									            t2 = time.time()
									            print(f'{name}下载完成\t\t用时  {t2-t1}')

									@Gooey(
									    program_name='isric数据下载器',
									    encoding="utf-8", )
									def main():
									    """Read the source URL and target directory from the GUI and start the download."""
									    parser = GooeyParser(description="isric数据下载器")
									    parser.add_argument('--url', default=r'https://files.isric.org/soilgrids/latest/data/')
									    parser.add_argument('--path', widget="DirChooser", default=r'F:/isricData/')
									    args = parser.parse_args()
									    # rfSearch joins paths by plain string concatenation, so both the URL and
									    # the local directory must end with a separator; a directory picked in
									    # the chooser may come back without one.
									    url = args.url if args.url.endswith('/') else args.url + '/'
									    nowPath = args.path
									    if not nowPath.endswith(('/', '\\')):
									        nowPath += '/'
									    # Make sure the download root exists before anything is written into it.
									    os.makedirs(nowPath, exist_ok=True)
									    rfSearch(getPath(url), url, nowPath)

									###如果不需要可视化，则不用gooey，可以将上面部分替换如下

									#@Gooey(

									#    program_name='isric数据下载器',

									#   encoding="utf-8", )

									#上面三行删除即可

									###main函数替换成下面部分：

									# def main():

									#     url=r'https://files.isric.org/soilgrids/latest/data/'#在此处修改地址链接

									#     nowPath = r'F:/isricData/'#在此处修改文件保存地址

									#     u = getPath(url)

									#     rfSearch(u, url,nowPath)

									# Start the downloader only when this file is executed as a script.
									if __name__ == "__main__":

									    main()

到此这篇关于Python Requests下载开源网站的代码(带索引数据)的文章就介绍到这了，更多相关Python Requests下载内容请搜索服务器之家以前的文章或继续浏览下面的相关文章，希望大家以后多多支持服务器之家！

原文链接：https://blog.csdn.net/qq_39632866/article/details/115642977

Python Requests下载开源网站的代码(带索引数据)

相关文章

Python Requests下载开源网站的代码(带索引数据)