Original post: https://www.cnblogs.com/0bug/p/8893677.html
What is urllib?
urllib is Python's built-in HTTP request library. It consists of four modules:
urllib.request: the request module
urllib.error: the exception handling module
urllib.parse: the URL parsing module
urllib.robotparser: the robots.txt parsing module
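urllib.robotparser is not covered further in this post; as a minimal sketch of its use (the robots.txt location is an assumption):

import urllib.robotparser

# Parse a site's robots.txt and check whether a URL may be crawled
rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://www.cnblogs.com/robots.txt')  # assumed robots.txt location
rp.read()
print(rp.can_fetch('*', 'http://www.cnblogs.com/0bug'))  # True if crawling is allowed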
Changes from Python 2
Python 2's urllib2 was merged into urllib.request in Python 3:
Python 2:
import urllib2
response = urllib2.urlopen('http://www.cnblogs.com/0bug')
Python 3:
import urllib.request
response = urllib.request.urlopen('http://www.cnblogs.com/0bug/')
urlopen()
Without the data argument the request is sent as a GET; with data it is sent as a POST.
import urllib.request

response = urllib.request.urlopen('http://www.cnblogs.com/0bug')
html = response.read().decode('utf-8')
print(html)
Result: the HTML of the page is printed.
Sending a POST request with data:
import urllib.parse
import urllib.request

data = bytes(urllib.parse.urlencode({'hello': '0bug'}), encoding='utf-8')
response = urllib.request.urlopen('http://httpbin.org/post', data=data)
print(response.read())
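httpbin.org/post echoes the request back as JSON, so the submitted form field hello=0bug appears in the printed response.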
timeout
If the server does not respond within the given number of seconds, an exception is raised:
import urllib.request

response = urllib.request.urlopen('http://www.cnblogs.com/0bug', timeout=0.01)
print(response.read())
Catching the timeout:

import socket
import urllib.error
import urllib.request

try:
    response = urllib.request.urlopen('http://www.cnblogs.com/0bug', timeout=0.01)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('Request timed out')
The response
1. Response type
import urllib.request

response = urllib.request.urlopen('http://www.cnblogs.com/0bug')
print(type(response))
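This prints <class 'http.client.HTTPResponse'>.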
2. Status code and response headers
import urllib.request

response = urllib.request.urlopen('http://www.cnblogs.com/0bug')
print(response.status)
print(response.getheaders())
print(response.getheader('Content-Type'))
3. Response body
The response body is a byte stream, so it has to be decoded with decode('utf-8'):
import urllib.request

response = urllib.request.urlopen('http://www.cnblogs.com/0bug')
html = response.read().decode('utf-8')
print(html)
Request
Wrapping the URL in a Request object gives the same result and makes it possible to customize the request:
import urllib.request

request = urllib.request.Request('http://www.cnblogs.com/0bug')
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))
Adding request headers:
from urllib import request, parse

url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    'Host': 'httpbin.org'
}
dic = {'name': '0bug'}
data = bytes(parse.urlencode(dic), encoding='utf-8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
add_header
Headers can also be added one at a time after the Request is constructed:
from urllib import request, parse

url = 'http://httpbin.org/post'
dic = {'name': '0bug'}
data = bytes(parse.urlencode(dic), encoding='utf-8')
req = request.Request(url=url, data=data, method='POST')
req.add_header('User-Agent',
               'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
Handler
Proxy:
import urllib.request

proxy_handler = urllib.request.ProxyHandler({
    'http': 'HTTP proxy address',
    'https': 'HTTPS proxy address'
})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('http://www.cnblogs.com/0bug')
print(response.read())
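The placeholder strings stand for real proxy URLs, e.g. 'http://127.0.0.1:8080' (a hypothetical local proxy).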
Cookie
import http.cookiejar
import urllib.request

cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
for item in cookie:
    print(item.name + "=" + item.value)
Saving cookies to a file
import http.cookiejar
import urllib.request

filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)
The cookies are written to cookie.txt in the Mozilla/Netscape cookie file format.
Another way to save them, using the LWP format:
import http.cookiejar
import urllib.request

filename = 'cookie.txt'
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)
Cookies must be loaded with the same CookieJar class that saved them:
import http.cookiejar
import urllib.request

cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))
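Correspondingly, a file saved with MozillaCookieJar would be loaded the same way, just with http.cookiejar.MozillaCookieJar() in place of LWPCookieJar(); the rest of the code is unchanged.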
Exception handling
from urllib import request, error

try:
    response = request.urlopen('http://www.cnblogs.com/0bug/xxxx')
except error.URLError as e:
    print(e.reason)
HTTPError is a subclass of URLError and carries extra detail (status code and headers), so it should be caught first:

from urllib import request, error

try:
    response = request.urlopen('http://www.cnblogs.com/0bug/xxxx')
except error.HTTPError as e:
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request succeeded')
e.reason is not always a string; it can also be an exception instance such as socket.timeout:

import socket
import urllib.error
import urllib.request

try:
    response = urllib.request.urlopen('http://www.cnblogs.com/0bug/xxxx', timeout=0.001)
except urllib.error.URLError as e:
    print(type(e.reason))
    if isinstance(e.reason, socket.timeout):
        print('Request timed out')
URL parsing
urlparse() splits a URL into its six components:
from urllib.parse import urlparse

result = urlparse('www.baidu.com/index.html;user?id=5#comment')
print(type(result))
print(result)
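This prints <class 'urllib.parse.ParseResult'> and ParseResult(scheme='', netloc='', path='www.baidu.com/index.html', params='user', query='id=5', fragment='comment'); without a scheme and '//' prefix, everything lands in path.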
from urllib.parse import urlparse

result = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(result)
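Because the URL itself has no scheme, the scheme argument is used as the default and the result has scheme='https'.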
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(result)
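A scheme present in the URL takes precedence over the scheme argument, so here the result keeps scheme='http'.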
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', allow_fragments=False)
print(result)
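With allow_fragments=False the '#comment' part is not split off; it stays attached to the query, giving query='id=5#comment' and fragment=''.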
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html#comment', allow_fragments=False)
print(result)
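When there is no query either, the fragment is folded into the path instead: path='/index.html#comment'.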
urlunparse
urlunparse() is the inverse of urlparse(): it assembles a URL from a six-element sequence:
from urllib.parse import urlunparse

data = ['http', 'www.baidu.com', 'index.html', 'user', 'id=6', 'comment']
print(urlunparse(data))
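This prints http://www.baidu.com/index.html;user?id=6#comment.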
urljoin
urljoin() resolves the second URL against the first: broadly speaking, if the second URL is absolute it wins outright, otherwise its missing components are filled in from the base:
from urllib.parse import urljoin

print(urljoin('http://www.baidu.com', 'ABC.html'))
print(urljoin('http://www.baidu.com', 'https://www.cnblogs.com/0bug'))
print(urljoin('http://www.baidu.com/0bug', 'https://www.cnblogs.com/0bug'))
print(urljoin('http://www.baidu.com/0bug', 'https://www.cnblogs.com/0bug?q=2'))
print(urljoin('http://www.baidu.com/0bug?q=2', 'https://www.cnblogs.com/0bug'))
print(urljoin('http://www.baidu.com', '?q=2#comment'))
print(urljoin('www.baidu.com', '?q=2#comment'))
print(urljoin('www.baidu.com#comment', '?q=2'))
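For example, the first call prints http://www.baidu.com/ABC.html, while the second prints https://www.cnblogs.com/0bug unchanged because the second argument is already an absolute URL.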
urlencode
urlencode() serializes a dict into a URL query string:
from urllib.parse import urlencode

params = {
    'name': '0bug',
    'age': 25
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url)
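This prints http://www.baidu.com?name=0bug&age=25. The reverse direction is handled by parse_qs; a small complementary sketch (not part of the original post):

from urllib.parse import parse_qs

# Parse a query string back into a dict mapping each key to a list of values
print(parse_qs('name=0bug&age=25'))  # {'name': ['0bug'], 'age': ['25']}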