坑一:
该站点的 HTTPS 使用的是 TLSv1.2 协议,这导致很多以前的爬虫都不能用了,解决方法是用 Python 的 ssl 库显式指定协议版本,代码如下:
参考的文章:http://www.cnblogs.com/vingi/articles/4131633.html
代码:
# -*- coding: utf-8 -*- import httplib,ssl, urllib2, socket class HTTPSConnectionV3(httplib.HTTPSConnection): def __init__(self,*args,**kwargs): httplib.HTTPSConnection.__init__(self,*args,**kwargs) def connect(self): sock= socket.create_connection((self.host,self.port),self.timeout) if self._tunnel_host: self.sock= sock self._tunnel() try: self.sock= ssl.wrap_socket(sock,self.key_file,self.cert_file,ssl_version=ssl.PROTOCOL_TLSv1) except ssl.SSLError,e: print("TryingSSLv3.") self.sock= ssl.wrap_socket(sock,self.key_file,self.cert_file,ssl_version=ssl.PROTOCOL_SSLv23) class HTTPSHandlerV3(urllib2.HTTPSHandler): def https_open(self,req): return self.do_open(HTTPSConnectionV3,req) #安装证书 urllib2.install_opener(urllib2.build_opener(HTTPSHandlerV3())) if __name__== "__main__": url="https://www.zoomeye.org/" heads={"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:44.0) Gecko/20100101 Firefox/44.0"} try: r=urllib2.Request(url,headers=heads) resual= urllib2.urlopen(r) print resual.read() except urllib2.URLError, e: print e.read()
输出的是:
<script>var dc="";var t_d={hello:"world",t_c:function(x){if(x==="")return;if(x.slice(-1)===";"){x=x+" ";};if(x.slice(-2)!=="; "){x=x+"; ";};dc=dc+x;}};(function(a){eval(function(p,a,c,k,e,d){e=function(c){return(c<a?"":e(parseInt(c/a)))+((c=c%a)>35?String.fromCharCode(c+29):c.toString(36))};if(!''.replace(/^/,String)){while(c--)d[e(c)]=k[c]||e(c);k=[function(e){return d[e]}];e=function(){return'\\w+'};c=1;};while(c--)if(k[c])p=p.replace(new RegExp('\\b'+e(c)+'\\b','g'),k[c]);return p;}('b d=[3,4,1,2,0];b o=[];b p=0;g(b i=0;i<d.f;i++){d[i]=a[d[i]]}d=d.j(\'\').l(\';\');g(b i=0;i<d.f;i++){h.m(d[i])}n("e.c=e.c.q(/[\\?|&]s-r/, \'\')",k);',29,29,'|||||||||||var|href||location|length|for|t_d||join|1500|split|t_c|setTimeout|||replace|challenge|captcha'.split('|'),0,{}));})(['14-Feb-16 04:46:09 GMT;Path=/;', 'XHhxKtGuWK1VUCL37XI%3D;Exp', 'ires=Sun, ', '__jsl_clearance=145542', '1569.853|0|kkQeowPJ']);document.cookie=dc;</script>
坑二:
虽然用正确的协议版本完成了 HTTPS 握手,但是还是不能拿到正常的网页,通过抓包、分析包,我发现必须带上正确的 cookie 才能请求到正常的数据
如何才能拿到正确的cookie呢?这里用到一个python的webkit:Ghost.py
参考的文章:
http://pyqt.sourceforge.net/Docs/PyQt4/qnetworkcookie.html#parseCookies
http://jeanphix.me/Ghost.py/
http://ghost-py.readthedocs.org/en/latest/
https://github.com/jeanphix/Ghost.py
代码:
import ghost g = ghost.Ghost() with g.start() as session: head={"Referer": "https://www.zoomeye.org/"} page, extra_resources = session.open("https://www.zoomeye.org/",method='get', headers=head,wait=True,encode_url=True, user_agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:44.0) Gecko/20100101 Firefox/44.0") for element in session.cookies: print element.toRawForm()
输出:
__jsluid=a94170e7f36908f508f4bdbfdcb69461; domain=www.zoomeye.org; path=/ __jsl_clearance=1455422745.375|0|ZTSyyuKTql2jEPtDBqdlp1xe1d8%3D; expires=Sun, 14-Feb-2016 05:05:45 GMT; domain=www.zoomeye.org; path=/
最后将两段代码整合一下就能得到正常的网页数据了
代码:
# -*- coding: utf-8 -*- import httplib,ssl, urllib2, socket, ghost from bs4 import BeautifulSoup #https证书安装类 class HTTPSConnectionV3(httplib.HTTPSConnection): def __init__(self,*args,**kwargs): httplib.HTTPSConnection.__init__(self,*args,**kwargs) def connect(self): sock= socket.create_connection((self.host,self.port),self.timeout) if self._tunnel_host: self.sock= sock self._tunnel() try: self.sock= ssl.wrap_socket(sock,self.key_file,self.cert_file,ssl_version=ssl.PROTOCOL_TLSv1) #选择证书类型 except ssl.SSLError,e: print("TryingSSLv3.") self.sock= ssl.wrap_socket(sock,self.key_file,self.cert_file,ssl_version=ssl.PROTOCOL_SSLv23) class HTTPSHandlerV3(urllib2.HTTPSHandler): def https_open(self,req): return self.do_open(HTTPSConnectionV3,req) #安装证书 urllib2.install_opener(urllib2.build_opener(HTTPSHandlerV3())) #获取zoomeye的cookie def getcookie(): g = ghost.Ghost() with g.start() as session: mycookielist=[] head={"Referer": "https://www.zoomeye.org/"} page, extra_resources = session.open("https://www.zoomeye.org/",method='get', headers=head,wait=True,encode_url=True, user_agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:44.0) Gecko/20100101 Firefox/44.0") for element in session.cookies: mycookielist.append(element.toRawForm().split(";")) cookiestr= mycookielist[0][0]+";"+mycookielist[1][0] return cookiestr #发送http请求 def sendhttp(): url="https://www.zoomeye.org/" heads={"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:44.0) Gecko/20100101 Firefox/44.0"} #heads["Cookie"]="__jsluid=260219e8b0e3b3574a2e28d15e8249ff;__jsl_clearance=1455352838.397|0|uSSCfF4umEIhqMz8GpcSDVBL9MI%3D" #heads["Cookie"]="__jsluid=9222a6c2cf192a864941182010a25d9b;__jsl_clearance=1455356240.603|0|50xi3gYoENuTPXaeWIhLW%2F00C2Y%3D;" heads["Cookie"]=getcookie() try: r=urllib2.Request(url,headers=heads) resual= urllib2.urlopen(r) return resual.read() except urllib2.URLError, e: print e.read() #主程序 if __name__== "__main__": htmlstr =sendhttp() print htmlstr