从文件中读取图片的url和名称,并将url指向的文件下载下来。文件中每一行包含一个url和一个文件名,两者用制表符隔开。
1、使用requests请求url并下载文件
def download(img_url, img_name):
    """Stream the resource at ``img_url`` into a file named ``img_name``.

    The file is created inside ``out_dir`` (a module-level name defined
    elsewhere in the script) and opened in binary mode. The response is
    requested with ``stream=True`` and copied chunk by chunk, asking for
    1024 bytes per chunk, so the whole body is not held in memory at once.

    Args:
        img_url: URL to fetch.
        img_name: File name to write under ``out_dir``.
    """
    # closing() guarantees the response is closed even if the write fails.
    with closing(requests.get(img_url, stream=True)) as response, \
            open(os.path.join(out_dir, img_name), 'wb') as target:
        for chunk in response.iter_content(1024):
            target.write(chunk)
2、从文件中读取url。考虑到文件可能较大,使用生成器逐行读取,避免一次性把整个文件载入内存。
def get_imgurl_generate(path='./example.txt'):
    """Lazily yield the tab-separated fields of each line in the list file.

    The file is read one line at a time, so a large list never has to be
    loaded into memory in full.

    Args:
        path: File to read. Defaults to ``'./example.txt'``.

    Yields:
        A list of the fields obtained by splitting the stripped line on tab
        characters (for a well-formed line: ``[url, file_name]``).
    """
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            # Skip blank lines: they carry no url/name pair.
            if not line:
                continue
            # The split was missing before, so the yielded name was undefined.
            imgs = line.split('\t')
            yield imgs
3、使用多线程进行下载
# Guards the shared generator: only one thread may advance it at a time.
lock = threading.Lock()


def loop(imgs):
    """Worker body: pull (url, name) pairs from ``imgs`` until it is exhausted.

    Args:
        imgs: Iterator shared by all worker threads; each item must unpack
            into a URL and a file name.
    """
    while True:
        try:
            # Hold the lock only while fetching the next item, so the
            # downloads themselves run concurrently.
            with lock:
                img_url, img_name = next(imgs)
        except StopIteration:
            break
        # Call the function this tutorial actually defines (`download`).
        download(img_url, img_name)


# Use the generator function this tutorial actually defines.
img_gen = get_imgurl_generate()
for i in range(0, thread_num):
    t = threading.Thread(target=loop, args=(img_gen,))
    t.start()
完整代码,加入异常处理
# -*- coding: utf-8 -*-
"""Download the images listed in ``./final.scp`` using several threads.

Each line of the list file holds a URL and a file name separated by a tab.
Files are written into ``out_dir``; a name that already exists there is
skipped. Progress and failures are reported on standard output.
"""
import os
from contextlib import closing
import threading
import requests
import time


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
}

# Output directory
out_dir = './output'
# Number of worker threads
thread_num = 20
# Timeout passed to requests.get for each HTTP request
timeout = 5

if not os.path.exists(out_dir):
    os.mkdir(out_dir)


def download(img_url, img_name):
    """Fetch ``img_url`` and store it as ``img_name`` inside ``out_dir``.

    Nothing is downloaded when the target file already exists. Responses
    with a status code outside 200-299, or with a missing/zero
    ``content-length`` header, are reported and skipped. If saving fails,
    the failure is reported and the partially written file is removed.

    Args:
        img_url: URL to fetch.
        img_name: File name to write under ``out_dir``.
    """
    img_path = os.path.join(out_dir, img_name)
    if os.path.isfile(img_path):
        return
    with closing(requests.get(img_url, stream=True, headers=headers, timeout=timeout)) as r:
        rc = r.status_code
        if not 200 <= rc <= 299:
            print('returnCode%s\t%s' % (rc, img_url))
            return
        content_length = int(r.headers.get('content-length', '0'))
        if content_length == 0:
            print('size0\t%s' % img_url)
            return
        try:
            with open(img_path, 'wb') as f:
                for data in r.iter_content(1024):
                    f.write(data)
        except Exception:
            print('savefail\t%s' % img_url)
            # A truncated file would be mistaken for a finished download by
            # the isfile() check on the next run, so remove it.
            try:
                os.remove(img_path)
            except OSError:
                # Best effort: the file may never have been created.
                pass


def get_imgurl_generate():
    """Yield ``[url, file_name]`` for every well-formed line of ``./final.scp``.

    Lines are read lazily. Blank lines, lines that do not split into exactly
    two tab-separated fields, and lines with an empty field are reported and
    skipped. A progress message is printed every 500 lines.
    """
    with open('./final.scp', 'r') as f:
        for index, line in enumerate(f, 1):
            if index % 500 == 0:
                print('execute %s line at %s' % (index, time.time()))
            # Strip before the emptiness test: a blank line read from a file
            # still contains its newline and would otherwise look non-empty.
            line = line.strip()
            if not line:
                print(r'line %s is empty "\t"' % index)
                continue
            imgs = line.split('\t')
            if len(imgs) != 2:
                print('line %s splite error' % index)
                continue
            if not imgs[0] or not imgs[1]:
                print('line %s img is empty' % index)
                continue
            yield imgs


# Guards the shared generator: only one thread may advance it at a time.
lock = threading.Lock()


def loop(imgs):
    """Worker body: download items from ``imgs`` until it is exhausted.

    A failure in one download is reported and does not stop the worker.

    Args:
        imgs: Iterator shared by all worker threads; each item must unpack
            into a URL and a file name.
    """
    print('thread %s is running...' % threading.current_thread().name)

    while True:
        try:
            # Hold the lock only while fetching the next item, so the
            # downloads themselves run concurrently.
            with lock:
                img_url, img_name = next(imgs)
        except StopIteration:
            break
        try:
            download(img_url, img_name)
        except Exception:
            print('exceptfail\t%s' % img_url)
    print('thread %s is end...' % threading.current_thread().name)


img_gen = get_imgurl_generate()

for i in range(0, thread_num):
    t = threading.Thread(target=loop, name='LoopThread %s' % i, args=(img_gen,))
    t.start()