Use case:
Retry a URL several times when the response status is not 200.
The code is fairly simple and includes some comments.
Python 2.7 implementation:
# -*-coding:utf-8-*-
"""
ayou
"""
import requests

def url_retry(url, num_retries=3):
    print("access!")
    try:
        request = requests.get(url, timeout=60)
        # raise_for_status() throws an HTTPError if the response status is an error (not 200)
        request.raise_for_status()
        html = request.content
    except requests.HTTPError as e:
        html = None
        if num_retries > 0:
            # not 200, so retry, decrementing the retry count each time
            return url_retry(url, num_retries - 1)
    # if the URL does not exist a ConnectionError is thrown; don't retry in that case
    except requests.exceptions.ConnectionError as e:
        return
    return html

url_retry("http://httpbin.org/status/404")
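If firing retries back to back is a concern (for example when the target site rate-limits), a short pause can be added before each recursive retry. Below is a minimal sketch of that variation; the delay parameter and the doubling backoff are illustrative additions, not part of the original code.

# -*-coding:utf-8-*-
import time
import requests

def url_retry_with_delay(url, num_retries=3, delay=1):
    print("access!")
    try:
        request = requests.get(url, timeout=60)
        request.raise_for_status()
        html = request.content
    except requests.HTTPError:
        html = None
        if num_retries > 0:
            # pause before retrying, doubling the wait each time
            time.sleep(delay)
            return url_retry_with_delay(url, num_retries - 1, delay * 2)
    except requests.exceptions.ConnectionError:
        # the URL cannot be reached at all, so don't retry
        return
    return html

url_retry_with_delay("http://httpbin.org/status/404")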
Python 3.5 implementation:
# -*-coding:utf-8-*-
"""
ayou
"""
import asyncio
import aiohttp

async def print_page(url, num_retries=3):
    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(url, timeout=60) as response:
                print("access!")
                # raise_for_status() throws an HttpProcessingError if the status is not 200
                response.raise_for_status()
                body = await response.text()
        except aiohttp.errors.HttpProcessingError as e:
            body = None
            if num_retries > 0:
                # not 200, so retry, decrementing the retry count each time
                return await print_page(url, num_retries - 1)
        # a non-existent URL raises a ClientResponseError; don't retry in that case
        except aiohttp.errors.ClientResponseError as e:
            return e
    print(body)
    return body

def main():
    # this is a non-existent URL
    # url = 'http://httpbin.org/status/404111'
    # this is a URL that returns 404
    url = 'http://httpbin.org/status/404'
    loop = asyncio.get_event_loop()
    loop.run_until_complete(print_page(url))
    loop.close()

if __name__ == '__main__':
    main()
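A note on aiohttp versions: the code above targets the aiohttp releases current at the time of writing (the 1.x series). In aiohttp 2.x and later the aiohttp.errors module was removed and the exception classes are exposed at the package top level, so the except clauses need adjusting. A rough sketch of the same retry logic under that assumption:

# -*-coding:utf-8-*-
import asyncio
import aiohttp

async def print_page(url, num_retries=3):
    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(url, timeout=60) as response:
                print("access!")
                # raise_for_status() throws a ClientResponseError for error statuses
                response.raise_for_status()
                body = await response.text()
        except aiohttp.ClientResponseError:
            body = None
            if num_retries > 0:
                # not 200, so retry, decrementing the retry count each time
                return await print_page(url, num_retries - 1)
        except aiohttp.ClientConnectorError:
            # the URL cannot be reached at all, so don't retry
            return None
    print(body)
    return body

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(print_page('http://httpbin.org/status/404'))
    loop.close()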
Wrapping the crawler's URL retry mechanism in a decorator (Python 2.7 and Python 3.5+)
Python 2.7 version:
# -*-coding:utf-8-*-
"""
ayou
"""
import requests

# define a retry decorator; retry once by default
def retry(num_retries=1):
    # receives the decorated function
    def wrapper(func):
        # receives the arguments of the decorated function
        def wrapped(*args, **kwargs):
            # keep the last exception around so it can be inspected if needed
            last_exception = None
            # run the wrapped function in a loop
            for _ in range(num_retries):
                try:
                    # if no error is raised, return the result, which also exits the loop
                    return func(*args, **kwargs)
                except Exception as e:
                    # don't return on error, otherwise the loop would stop
                    last_exception = e
            # uncomment to re-raise the last error after all retries fail
            # raise last_exception
        return wrapped
    return wrapper

if __name__ == "__main__":
    @retry(5)
    def url_retry(url):
        request = requests.get(url, timeout=60)
        print("access!")
        request.raise_for_status()
        html = request.content
        print(html)
        return html

    url_retry("http://httpbin.org/status/404")
    # url_retry("http://httpbin.org/status/404111")
    # url_retry("http://www.baidu.com")
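The commented-out raise last_exception line hints at a stricter variant: if every attempt fails, re-raise the last error instead of silently returning None, so the caller can tell a failed download from an empty one. A minimal sketch of that change (only the final re-raise is added):

# -*-coding:utf-8-*-
import requests

def retry(num_retries=1):
    def wrapper(func):
        def wrapped(*args, **kwargs):
            last_exception = None
            for _ in range(num_retries):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    last_exception = e
            # every attempt failed: surface the last error to the caller
            raise last_exception
        return wrapped
    return wrapper

if __name__ == "__main__":
    @retry(5)
    def url_retry(url):
        request = requests.get(url, timeout=60)
        request.raise_for_status()
        return request.content

    try:
        url_retry("http://httpbin.org/status/404")
    except requests.HTTPError as e:
        print("gave up after 5 attempts: %s" % e)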
Python 3.5+ version:
# -*-coding:utf-8-*-
"""
ayou
"""
import asyncio
import aiohttp

# define a retry decorator; retry once by default
def retry(num_retries=1):
    # receives the decorated function
    def wrapper(func):
        # receives the arguments of the decorated function
        def wrapped(*args, **kwargs):
            # keep the last exception around so it can be inspected if needed
            last_exception = None
            # run the wrapped function in a loop
            for _ in range(num_retries):
                try:
                    # if no error is raised, return the result, which also exits the loop
                    return func(*args, **kwargs)
                except Exception as e:
                    # don't return on error, otherwise the loop would stop
                    last_exception = e
            # uncomment to re-raise the last error after all retries fail
            # raise last_exception
        return wrapped
    return wrapper

async def print_page(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url, timeout=60) as response:
            print("access!")
            # raise_for_status() throws an HttpProcessingError if the status is not 200
            response.raise_for_status()
            body = await response.text()
    print(body)
    return body

@retry(5)
def loop_get():
    # url = "http://www.baidu.com"
    # url = 'http://httpbin.org/status/404111'
    url = 'http://httpbin.org/status/404'
    # use a fresh event loop for every attempt, since each attempt closes its loop
    loop = asyncio.new_event_loop()
    loop.run_until_complete(print_page(url))
    loop.close()

if __name__ == '__main__':
    loop_get()
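Finally, when the requests route is enough, the retry logic does not have to be written by hand at all: requests can delegate it to urllib3's Retry class through an HTTPAdapter. The sketch below is only an illustration of that approach; the status list and backoff values are arbitrary choices, with 404 included just to mirror the test URL used throughout this article.

# -*-coding:utf-8-*-
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry  # urllib3 ships as a dependency of requests

def make_session(num_retries=3):
    # retry GET requests up to num_retries times on the listed status codes,
    # waiting a little longer between each attempt (exponential backoff)
    retries = Retry(total=num_retries, backoff_factor=0.5,
                    status_forcelist=[404, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retries)
    session = requests.Session()
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session

if __name__ == "__main__":
    session = make_session()
    try:
        response = session.get("http://httpbin.org/status/404", timeout=60)
        print(response.status_code)
    except requests.exceptions.RetryError:
        # all retries were used up and the server kept returning an error status
        print("gave up after retries")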
That is the whole of this piece on implementing a URL retry mechanism for Python crawlers (Python 2.7 and Python 3.5). I hope it can serve as a useful reference, and I also hope you will continue to support 服务器之家.
Original article: https://blog.csdn.net/u013055678/article/details/54290481