坑一:
该站点的 HTTPS 只接受较新的 TLS 协议版本(TLSv1.2),很多老爬虫默认的握手方式会被服务器拒绝,因而无法建立连接。解决方法是用 Python 的 ssl 库显式指定握手使用的协议版本,代码如下:
参考的文章:http://www.cnblogs.com/vingi/articles/4131633.html
代码:
# -*- coding: utf-8 -*-
import httplib,ssl, urllib2, socket
class HTTPSConnectionV3(httplib.HTTPSConnection):
    """HTTPSConnection that forces a TLSv1 handshake and, if the server
    rejects it, retries with PROTOCOL_SSLv23 (auto-negotiation).

    Works around servers whose TLS configuration breaks the default
    Python 2 httplib/urllib2 handshake.
    """

    def __init__(self, *args, **kwargs):
        httplib.HTTPSConnection.__init__(self, *args, **kwargs)

    def connect(self):
        # Open the raw TCP connection first; TLS is layered on below.
        sock = socket.create_connection((self.host, self.port), self.timeout)
        if self._tunnel_host:
            # Issue the proxy CONNECT before starting the TLS handshake.
            self.sock = sock
            self._tunnel()
        try:
            self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file,
                                        ssl_version=ssl.PROTOCOL_TLSv1)
        except ssl.SSLError:
            # BUGFIX: the original message claimed "SSLv3", but the fallback
            # actually uses PROTOCOL_SSLv23 (auto-negotiate); message now
            # matches the code.
            print("TLSv1 handshake failed, retrying with SSLv23.")
            self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file,
                                        ssl_version=ssl.PROTOCOL_SSLv23)
class HTTPSHandlerV3(urllib2.HTTPSHandler):
    """urllib2 HTTPS handler that routes requests through HTTPSConnectionV3."""

    def https_open(self, req):
        # Hand the request to do_open with our TLS-aware connection class.
        connection_factory = HTTPSConnectionV3
        return self.do_open(connection_factory, req)
#安装证书
urllib2.install_opener(urllib2.build_opener(HTTPSHandlerV3()))
if __name__== "__main__":
url="https://www.zoomeye.org/"
heads={"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:44.0) Gecko/20100101 Firefox/44.0"}
try:
r=urllib2.Request(url,headers=heads)
resual= urllib2.urlopen(r)
print resual.read()
except urllib2.URLError, e:
print e.read()
输出的是:
<script>var dc="";var t_d={hello:"world",t_c:function(x){if(x==="")return;if(x.slice(-1)===";"){x=x+" ";};if(x.slice(-2)!=="; "){x=x+"; ";};dc=dc+x;}};(function(a){eval(function(p,a,c,k,e,d){e=function(c){return(c<a?"":e(parseInt(c/a)))+((c=c%a)>35?String.fromCharCode(c+29):c.toString(36))};if(!''.replace(/^/,String)){while(c--)d[e(c)]=k[c]||e(c);k=[function(e){return d[e]}];e=function(){return'\\w+'};c=1;};while(c--)if(k[c])p=p.replace(new RegExp('\\b'+e(c)+'\\b','g'),k[c]);return p;}('b d=[3,4,1,2,0];b o=[];b p=0;g(b i=0;i<d.f;i++){d[i]=a[d[i]]}d=d.j(\'\').l(\';\');g(b i=0;i<d.f;i++){h.m(d[i])}n("e.c=e.c.q(/[\\?|&]s-r/, \'\')",k);',29,29,'|||||||||||var|href||location|length|for|t_d||join|1500|split|t_c|setTimeout|||replace|challenge|captcha'.split('|'),0,{}));})(['14-Feb-16 04:46:09 GMT;Path=/;', 'XHhxKtGuWK1VUCL37XI%3D;Exp', 'ires=Sun, ', '__jsl_clearance=145542', '1569.853|0|kkQeowPJ']);document.cookie=dc;</script>
坑二:
虽然安装了正确的证书,但是还是不能拿到正常的网页,通过抓包,分析包,我发现必须带上正确的cookie才能请求到正常的数据
如何才能拿到正确的 cookie 呢?这里用到一个基于 WebKit 的 Python 无头浏览器库:Ghost.py,它能执行页面里的 JavaScript,从而让服务端下发合法的 cookie。
参考的文章:
http://pyqt.sourceforge.net/Docs/PyQt4/qnetworkcookie.html#parseCookies
http://jeanphix.me/Ghost.py/
http://ghost-py.readthedocs.org/en/latest/
https://github.com/jeanphix/Ghost.py
代码:
import ghost
g = ghost.Ghost()
with g.start() as session:
head={"Referer": "https://www.zoomeye.org/"}
page, extra_resources = session.open("https://www.zoomeye.org/",method='get', headers=head,wait=True,encode_url=True, user_agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:44.0) Gecko/20100101 Firefox/44.0")
for element in session.cookies:
print element.toRawForm()
输出:
__jsluid=a94170e7f36908f508f4bdbfdcb69461; domain=www.zoomeye.org; path=/
__jsl_clearance=1455422745.375|0|ZTSyyuKTql2jEPtDBqdlp1xe1d8%3D; expires=Sun, 14-Feb-2016 05:05:45 GMT; domain=www.zoomeye.org; path=/
最后将两段代码整合一下就能得到正常的网页数据了
代码:
# -*- coding: utf-8 -*-
import httplib,ssl, urllib2, socket, ghost
from bs4 import BeautifulSoup
# HTTPS connection class that pins the TLS protocol version.
class HTTPSConnectionV3(httplib.HTTPSConnection):
    """HTTPSConnection that forces a TLSv1 handshake and, if the server
    rejects it, retries with PROTOCOL_SSLv23 (auto-negotiation).

    Works around servers whose TLS configuration breaks the default
    Python 2 httplib/urllib2 handshake.
    """

    def __init__(self, *args, **kwargs):
        httplib.HTTPSConnection.__init__(self, *args, **kwargs)

    def connect(self):
        # Open the raw TCP connection first; TLS is layered on below.
        sock = socket.create_connection((self.host, self.port), self.timeout)
        if self._tunnel_host:
            # Issue the proxy CONNECT before starting the TLS handshake.
            self.sock = sock
            self._tunnel()
        try:
            # Choose the TLS protocol version explicitly.
            self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file,
                                        ssl_version=ssl.PROTOCOL_TLSv1)
        except ssl.SSLError:
            # BUGFIX: the original message claimed "SSLv3", but the fallback
            # actually uses PROTOCOL_SSLv23 (auto-negotiate); message now
            # matches the code.
            print("TLSv1 handshake failed, retrying with SSLv23.")
            self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file,
                                        ssl_version=ssl.PROTOCOL_SSLv23)
class HTTPSHandlerV3(urllib2.HTTPSHandler):
    """urllib2 HTTPS handler that routes requests through HTTPSConnectionV3."""

    def https_open(self, req):
        # Hand the request to do_open with our TLS-aware connection class.
        connection_factory = HTTPSConnectionV3
        return self.do_open(connection_factory, req)
# Install the custom HTTPS handler globally so urllib2.urlopen uses it.
urllib2.install_opener(urllib2.build_opener(HTTPSHandlerV3()))
# Obtain zoomeye's anti-bot cookies by running the JS challenge in a
# headless WebKit browser (Ghost.py).
def getcookie():
    """Return a Cookie header value ("name=value;name=value") for zoomeye.

    Generalized from the hard-coded two-cookie version: joins the
    name=value pair of every cookie in the session, so it no longer
    raises IndexError when the site sets more or fewer than two cookies.
    """
    g = ghost.Ghost()
    with g.start() as session:
        head = {"Referer": "https://www.zoomeye.org/"}
        page, extra_resources = session.open(
            "https://www.zoomeye.org/",
            method='get',
            headers=head,
            wait=True,
            encode_url=True,
            user_agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:44.0) Gecko/20100101 Firefox/44.0")
        # toRawForm() yields "name=value; expires=...; path=/" -- keep only
        # the leading name=value pair.  str() guards against toRawForm()
        # returning a QByteArray rather than a plain str.
        pairs = [str(element.toRawForm()).split(";")[0]
                 for element in session.cookies]
        return ";".join(pairs)
#发送http请求
def sendhttp():
url="https://www.zoomeye.org/"
heads={"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:44.0) Gecko/20100101 Firefox/44.0"}
#heads["Cookie"]="__jsluid=260219e8b0e3b3574a2e28d15e8249ff;__jsl_clearance=1455352838.397|0|uSSCfF4umEIhqMz8GpcSDVBL9MI%3D"
#heads["Cookie"]="__jsluid=9222a6c2cf192a864941182010a25d9b;__jsl_clearance=1455356240.603|0|50xi3gYoENuTPXaeWIhLW%2F00C2Y%3D;"
heads["Cookie"]=getcookie()
try:
r=urllib2.Request(url,headers=heads)
resual= urllib2.urlopen(r)
return resual.read()
except urllib2.URLError, e:
print e.read()
#主程序
if __name__== "__main__":
htmlstr =sendhttp()
print htmlstr