Python -- 网络编程 -- 抓取网页图片 -- 豆瓣妹子

时间:2023-01-07 08:24:36

首先分析页面URL,形如http://dbmeizi.com/category/[1-14]?p=[0-476]

图片种类对应编号

1:'性感', 2:'有沟', 3:'美腿', 4:'小露点',
6:'所有男', 7:'肌肉男', 8:'清新男', 9:'有意思',
10:'所有', 11:'小清新', 12:'文艺', 13:'文艺男', 14:'美臀'

图片地址形如data-bigimg="http://pic.dbmeizi.com/pics/nn2nn2nn/p12378370.jpg"

-----源代码meizi.py-----

 import re, os, time
 import urllib.request

 def getHtml(url):
     """Fetch *url* and return the response body as text.

     The body is read in full and decoded as UTF-8.  The response object is
     used as a context manager so the underlying connection is closed as
     soon as the body has been read, rather than whenever the object
     happens to be garbage-collected.

     Args:
         url: Address of the page to fetch (anything ``urlopen`` accepts).

     Returns:
         The decoded page content as a ``str``.

     Raises:
         urllib.error.URLError: If the request fails.
         UnicodeDecodeError: If the body is not valid UTF-8.
     """
     with urllib.request.urlopen(url) as response:
         return response.read().decode('utf-8')

 def download(url, filename):
     """Retrieve the resource at *url* and store it locally as *filename*."""
     urllib.request.urlretrieve(url, filename=filename)

 # Matches the image URL inside a ``...bigimg="..."`` attribute.  The character
 # class stops at the closing quote, so several images on one HTML line are
 # returned as separate matches instead of one over-long greedy match.
 _BIGIMG_RE = re.compile(r'(?<=bigimg=")[^"]+jpg')


 def _pic_filename(pic):
     """Return the last path segment of the image URL *pic*."""
     # Taking the text after the final '/' works for any name length; a fixed
     # slice such as pic[-13:] only works for 13-character file names.
     return pic.rsplit('/', 1)[-1]


 def _save_page(url, folder):
     """Download every image linked from the page at *url* into *folder*.

     A ``log.txt`` file is (re)written in *folder*.  It records the page URL
     followed by one line per image link; links whose download failed are
     marked as such.

     Args:
         url: Address of the listing page to scan for image links.
         folder: Destination directory path, ending with a path separator.
     """
     pics = _BIGIMG_RE.findall(getHtml(url))
     os.makedirs(folder, exist_ok=True)
     # Explicit UTF-8 so the Chinese log text can be written regardless of
     # the platform's default encoding; ``with`` closes the log on any exit.
     with open(folder + 'log.txt', 'wt', encoding='utf-8') as logfile:
         logfile.write('图片来源:' + url + '\n图片链接:\n')
         for pic in pics:
             print('正在下载', pic)
             try:
                 download(pic, folder + _pic_filename(pic))
             except Exception:
                 # Best effort: note the failure and carry on with the next
                 # image.  ``Exception`` (not a bare ``except``) lets Ctrl-C
                 # still interrupt the program.
                 print('下载出错')
                 logfile.write(pic + ' 下载出错!\n')
             else:
                 logfile.write(pic + '\n')


 def main():
     """Ask for a category and a page range, then download those pages.

     Images of page ``no`` are stored under
     ``D:/DBMeizi/<category name>/<no>/``.

     Raises:
         SystemExit: If the entered category number is not in the menu.
         ValueError: If any of the entered values is not an integer.
     """
     print('---豆瓣妹子抓图机---')
     dic = {1:'性感', 2:'有沟', 3:'美腿', 4:'小露点',
            6:'所有男', 7:'肌肉男', 8:'清新男', 9:'有意思' ,
            10:'所有', 11:'小清新', 12:'文艺', 13:'文艺男', 14:'美臀'}
     # Print the menu, breaking the line after every key divisible by 4.
     for i in dic:
         print('{:<15}'.format(str(i) + '--' + dic[i]), end='')
         if i % 4 == 0:
             print()
     category = int(input('\n请输入抓取类别:'))
     if category not in dic:
         # Reject unknown categories before any network request is made,
         # instead of failing later with a KeyError.
         raise SystemExit('无效的类别编号: {}'.format(category))
     # The prompts mention 0-476: per the original author's note, the site
     # had exactly 476 pages on 2014-05-05.
     pageNo1 = int(input('请输入抓取页面起始编号(0-476):'))
     pageNo2 = int(input('请输入抓取页面终止编号(0-476):'))
     for no in range(pageNo1, pageNo2 + 1):
         url = 'http://dbmeizi.com/category/{}?p={}'.format(category, no)
         folder = 'D:/DBMeizi/{}/{}/'.format(dic[category], no)
         _save_page(url, folder)
         print('下载' + dic[category] + '[' + str(no) + ']结束。')
         # Pause one second between pages (presumably to go easy on the
         # server).
         time.sleep(1)
     print('全部任务结束。')


 if __name__ == '__main__':
     main()
     

Python -- 网络编程 -- 抓取网页图片 -- 豆瓣妹子

Python -- 网络编程 -- 抓取网页图片 -- 豆瓣妹子