python爬虫之urllib

时间:2023-03-09 16:42:29
python爬虫之urllib
#coding=utf-8
#urllib操作类
import time
import urllib.request
import urllib.parse
from urllib.error import HTTPError, URLError
import sys
class myUrllib:
    """Small helper around ``urllib.request`` for GET and POST requests.

    Every method is static; the class only serves as a namespace.
    """

    @staticmethod
    def get_headers(headers):
        """Return the default request headers overlaid with *headers*.

        Args:
            headers: Mapping of extra header names to values, or a falsy
                value (``None`` / ``{}``) to use the defaults unchanged.

        Returns:
            A new dict in which entries from *headers* override defaults.
        """
        default_headers = {
            'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
            'Connection': 'keep-alive',
            'Cookie': 'uuid_tt_dd=2845574184150781887; _ga=GA1.2.1608505838; dc_tos=p308',
        }
        if headers:
            return dict(default_headers, **headers)
        return default_headers

    @staticmethod
    def _fetch(request):
        """Open *request* and return its body decoded as UTF-8.

        Returns:
            The decoded page, or ``None`` when urllib reports an
            ``HTTPError`` or ``URLError`` (the error is printed).
        """
        try:
            # The context manager closes the response even if read() fails.
            with urllib.request.urlopen(request) as response:
                html = response.read()
        except HTTPError as e:
            # HTTPError is a subclass of URLError, so it must be caught first.
            print(e.code, e.reason)
            return None
        except URLError as e:
            print(e.reason)
            return None
        return html.decode('utf-8')

    @staticmethod
    def get(url, headers=None):
        """Send a GET request to *url*.

        Args:
            url: Target URL, including any query string.
            headers: Optional extra headers merged over the defaults.

        Returns:
            The response body as ``str``, or ``None`` on a urllib error.
        """
        request = urllib.request.Request(
            url,
            headers=myUrllib.get_headers(headers),
            method='GET',
        )
        return myUrllib._fetch(request)

    @staticmethod
    def post(url, data=None, headers=None):
        """Send *data* to *url* as a URL-encoded POST form.

        Args:
            url: Target URL.
            data: Form fields accepted by ``urllib.parse.urlencode``;
                a falsy value sends an empty body.
            headers: Optional extra headers merged over the defaults.

        Returns:
            The response body as ``str``, or ``None`` on a urllib error.
        """
        binary_data = urllib.parse.urlencode(data or {}).encode('utf-8')
        request = urllib.request.Request(
            url,
            data=binary_data,
            headers=myUrllib.get_headers(headers),
            method='POST',
        )
        return myUrllib._fetch(request)


if __name__ == '__main__':
    getInfo = myUrllib.get('http://localhost:88/test/c.php?act=category', {'Referer': r'https://www.baidu.com/'})
    print(getInfo)
    # The demo stops after the GET request; remove this call to also run
    # the POST example below.
    sys.exit()

    postInfo = myUrllib.post('http://localhost:88/test/c.php', {'id': 1010}, {'Referer': r'https://www.baidu.com/'})
    print(postInfo)

d:\python\crawler>python urllib01.py

HTTP_HOST:

 localhost:88

HTTP_USER_AGENT:

 Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)

 Chrome/63.0.3239.108 Safari/537.36

HTTP_COOKIE:

 uuid_tt_dd=2845574184150781887; _ga=GA1.2.1608505838; dc_tos=p308

HTTP_REFERER:

 https://www.baidu.com/

REQUEST_METHOD:

 GET

GET DATA:

array(1) {

  ["act"]=>

  string(8) "category"

}

#设置代理

#coding=utf-8
import urllib.request
import random
from urllib.error import HTTPError, URLError


def proxy_handler(url, iplist, wfile, timeout=None):
    """Fetch *url* through each proxy in turn and save the first 200 response.

    Proxies are tried in list order. The first one that answers with
    status 200 has its body decoded as UTF-8 and written to *wfile*,
    after which the loop stops. Proxies that raise ``HTTPError`` or
    ``URLError`` are reported and skipped.

    Args:
        url: The HTTP URL to request.
        iplist: Iterable of ``"host:port"`` proxy addresses.
        wfile: Path of the file that receives the page text.
        timeout: Optional timeout in seconds passed to ``urlopen``;
            ``None`` (the default) calls ``urlopen`` without a timeout
            argument, as before.

    Returns:
        None.
    """
    open_kwargs = {} if timeout is None else {'timeout': timeout}
    user_agent = r'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'

    for ip in iplist:
        print('*'*20,'\n ip:',ip)
        proxy_support = urllib.request.ProxyHandler({'http': ip})
        opener = urllib.request.build_opener(proxy_support)
        opener.addheaders = [('User-Agent', user_agent)]
        # Installs the opener globally, so later urlopen() calls in this
        # process also go through the most recently tried proxy.
        urllib.request.install_opener(opener)

        try:
            with urllib.request.urlopen(url, **open_kwargs) as response:
                code = response.getcode()
                # Keep the caller's url intact: a redirect seen through one
                # proxy must not change what the next proxy is asked for.
                final_url = response.geturl()
                print('*'*20,'\n url:',final_url)
                print('*'*20,'\n code:',code)
                info = response.info()
                print('*'*20,'\n info:',info)
                if code != 200:
                    continue
                page = str(response.read(), encoding='utf-8')
        except HTTPError as e:
            # HTTPError is a subclass of URLError, so it must be caught first.
            print(e.code, e.reason)
            continue
        except URLError as e:
            print(e.reason)
            continue

        with open(wfile, 'w', encoding='UTF-8') as fw:
            fw.write(page)
        print('*'*20,'\n write file:',wfile)
        break


if __name__ == '__main__':
    url = r'http://ip.chinaz.com/'
    iplist = ['182.42.244.169:808', '122.72.18.34:80', '52.44.16.168:3129']
    wfile = 'page.txt'
    proxy_handler(url, iplist, wfile)

d:\python\crawler>python proxy01.py

********************

 ip: 182.42.244.169:808

[WinError 10061] 由于目标计算机积极拒绝,无法连接。

********************

 ip: 122.72.18.34:80

********************

 url: http://ip.chinaz.com/

********************

 code: 200

********************

 info: Cache-Control: private

Content-Length: 33900

Content-Type: text/html; charset=utf-8

Server: Microsoft-IIS/7.5

X-AspNet-Version: 4.0.30319

Set-Cookie: qHistory=aHR0cDovL2lwLmNoaW5hei5jb20rSVAv5pyN5Yqh5Zmo5Zyw5Z2A5p+l6K

i; domain=.chinaz.com; expires=Tue, 05-Feb-2019 15:03:42 GMT; path=/

X-Powered-By: ASP.NET

Date: Mon, 05 Feb 2018 15:03:42 GMT

X-Cache: MISS from GD-SZ-WEB-01

X-Cache-Lookup: MISS from GD-SZ-WEB-01:80

Connection: close

********************

 write file: page.txt