The core idea is to use a regular expression to pull the path names and file names out of the page's HTML, then apply the same extraction to each discovered path, searching recursively. In the end, every content file on the site gets downloaded.
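As a quick illustration of the matching step, here is a minimal, self-contained run of the same kind of regex on a made-up directory-index line (the sample HTML string is an assumption, not taken from the target site):

import re

# Hypothetical fragment of an auto-generated directory listing
sample = '<a href="lib/">lib/</a> <a href="app.js">app.js</a>'
print(re.findall(r'<a href="(.*?)">', sample))
# -> ['lib/', 'app.js']: a trailing '/' marks a directory to recurse
#    into; anything else is treated as a file to download

Note the non-greedy (.*?), which yields one capture per link even when several links share a line.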
import urllib
import re
import os
path = []

def extract(url):
    content = urllib.urlopen(url).read()
    #reg = r'(?:href|HREF)="?((?:http://)?.+?\.txt)'
    # Non-greedy match, so a line with several links yields one
    # capture per <a href="..."> instead of one huge match
    reg = r'<a href="(.*?)">'
    url_re = re.compile(reg)
    url_lst = re.findall(url_re, content)
    for lst in url_lst:
        # Skip parent-directory, absolute, and sort-order links,
        # otherwise the recursion climbs back up and never terminates
        if lst.startswith(('..', '/', '?')):
            continue
        if lst.endswith('/'):
            # Trailing slash: a subdirectory, search it recursively
            extract(url + lst)
        else:
            # A file: remember its full URL for the download phase
            path.append(url + lst)
print "downloading with urllib"
url = 'http://139.196.233.65/js/'
extract(url)
filePath = 'E:/6-学习文档/91-JS/Download/js'
filePath = unicode(filePath, 'utf8')
for p in path:
    # Path of the file relative to the /js root on the server
    fileTitle = p.split('/js')[-1]
    file = filePath + fileTitle
    # Create the local directory tree before saving into it
    dir = os.path.dirname(file)
    if not os.path.exists(dir):
        os.makedirs(dir)
    urllib.urlretrieve(p, file)
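The script above targets Python 2 (urllib.urlopen, urllib.urlretrieve, the print statement). For Python 3, where those functions moved into urllib.request, a minimal sketch of the same crawler could look like the following; the local directory name 'download' is an assumption:

import os
import re
import urllib.request

paths = []

def extract(url):
    html = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
    for link in re.findall(r'<a href="(.*?)">', html):
        if link.startswith(('..', '/', '?')):
            continue                    # skip parent/absolute/sort links
        if link.endswith('/'):
            extract(url + link)         # subdirectory: recurse into it
        else:
            paths.append(url + link)    # file: queue its full URL

extract('http://139.196.233.65/js/')
for p in paths:
    local = os.path.join('download', p.split('/js/')[-1])
    os.makedirs(os.path.dirname(local), exist_ok=True)
    urllib.request.urlretrieve(p, local)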