首先分析页面URL,形如http://dbmeizi.com/category/[1-14]?p=[0-476]
图片种类对应编号:
1:'性感', 2:'有沟', 3:'美腿', 4:'小露点',
6:'所有男', 7:'肌肉男', 8:'清新男', 9:'有意思',
10:'所有', 11:'小清新', 12:'文艺', 13:'文艺男', 14:'美臀'
图片地址形如data-bigimg="http://pic.dbmeizi.com/pics/nn2nn2nn/p12378370.jpg"
-----源代码meizi.py-----
import os
import re
import time
import urllib.request

# Category id -> display name; the name is also used as the sub-folder name.
# There is no entry for id 5.
CATEGORIES = {
    1: '性感', 2: '有沟', 3: '美腿', 4: '小露点',
    6: '所有男', 7: '肌肉男', 8: '清新男', 9: '有意思',
    10: '所有', 11: '小清新', 12: '文艺', 13: '文艺男', 14: '美臀',
}

# Root directory under which downloaded pictures are stored.
SAVE_ROOT = 'D:/DBMeizi/'

# Matches the value of a ``...bigimg="<url>"`` attribute up to its last "jpg".
# ``[^"]`` keeps the match inside a single attribute value; a greedy ``.+``
# would run on to the last "jpg" of the line and glue several URLs together.
PIC_PATTERN = re.compile(r'(?<=bigimg=")[^"]+jpg')


def getHtml(url):
    """Return the content fetched from *url*, decoded as UTF-8 text.

    Raises whatever ``urllib.request.urlopen`` raises, and
    ``UnicodeDecodeError`` if the body is not valid UTF-8.
    """
    # The context manager closes the response even if read/decode fails.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')


def download(url, filename):
    """Download *url* and save it to the local path *filename*."""
    urllib.request.urlretrieve(url, filename)


def findPics(html):
    """Return the list of big-image URLs found in the page text *html*."""
    return PIC_PATTERN.findall(html)


def picFilename(pic):
    """Return the last path component of the picture URL *pic*.

    Slicing a fixed number of trailing characters only works for one
    specific file-name length, so split on the final '/' instead.
    """
    return pic.rsplit('/', 1)[-1]


def crawlPage(category, no, root=SAVE_ROOT):
    """Download every picture of page *no* of *category* and write a log.

    Pictures and ``log.txt`` go to ``<root><category name>/<no>/``.
    A failed picture download is reported and logged, then skipped.

    Raises ``KeyError`` if *category* is not a key of ``CATEGORIES``; errors
    from fetching the listing page itself are propagated.
    """
    url = 'http://dbmeizi.com/category/{}?p={}'.format(category, no)
    pics = findPics(getHtml(url))

    folder = '{}{}/{}/'.format(root, CATEGORIES[category], no)
    os.makedirs(folder, exist_ok=True)

    # Explicit UTF-8 so the Chinese log text can be written regardless of the
    # platform's default encoding; ``with`` guarantees the file is closed.
    with open(folder + 'log.txt', 'wt', encoding='utf-8') as logfile:
        logfile.write('图片来源:' + url + '\n图片链接:\n')
        for pic in pics:
            print('正在下载', pic)
            try:
                download(pic, folder + picFilename(pic))
            except Exception:
                # Best-effort: one broken picture must not abort the page.
                # ``Exception`` (not a bare except) keeps Ctrl-C working.
                print('下载出错')
                logfile.write(pic + ' 下载出错!\n')
                continue
            logfile.write(pic + '\n')

    print('下载' + CATEGORIES[category] + '[' + str(no) + ']结束。')


def main():
    """Prompt for a category and a page range, then crawl each page."""
    print('---豆瓣妹子抓图机---')
    # Menu: four entries per row, broken after ids divisible by 4.
    for i, name in CATEGORIES.items():
        print('{:<15}'.format(str(i) + '--' + name), end='')
        if i % 4 == 0:
            print()

    category = int(input('\n请输入抓取类别:'))
    if category not in CATEGORIES:
        # Fail before any network traffic instead of with a late KeyError.
        raise SystemExit('无效的抓取类别: {}'.format(category))
    pageNo1 = int(input('请输入抓取页面起始编号(0-476):'))  # 476 pages as of 2014-05-05
    pageNo2 = int(input('请输入抓取页面终止编号(0-476):'))

    for no in range(pageNo1, pageNo2 + 1):
        crawlPage(category, no)
        time.sleep(1)  # pause between listing pages
    print('全部任务结束。')


if __name__ == '__main__':
    main()