爬zoomeye遇到的坑和解决方案

时间:2021-05-24 20:56:34

坑一:

zoomeye 的 HTTPS 使用 TLSv1.2 加密,这导致很多以前的爬虫都不能用了。解决方法是用 Python 的 ssl 库显式指定协议版本,代码如下:

参考的文章:http://www.cnblogs.com/vingi/articles/4131633.html

代码:

# -*- coding: utf-8 -*-
import httplib,ssl, urllib2, socket
class HTTPSConnectionV3(httplib.HTTPSConnection):
    """HTTPSConnection that first attempts a TLSv1 handshake and, if the
    server rejects it, retries letting OpenSSL auto-negotiate the protocol
    version (PROTOCOL_SSLv23)."""

    def __init__(self, *args, **kwargs):
        httplib.HTTPSConnection.__init__(self, *args, **kwargs)

    def connect(self):
        # Create the raw TCP connection ourselves so we control the
        # ssl_version used when wrapping it.
        sock = socket.create_connection((self.host, self.port), self.timeout)
        if self._tunnel_host:
            self.sock = sock
            self._tunnel()
        try:
            self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file,
                                        ssl_version=ssl.PROTOCOL_TLSv1)
        except ssl.SSLError:
            # TLSv1 handshake failed; fall back to PROTOCOL_SSLv23, which
            # auto-negotiates the highest version both sides support.
            # (The original message claimed "SSLv3", which was misleading.)
            print("TLSv1 failed, retrying with auto-negotiation (SSLv23).")
            self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file,
                                        ssl_version=ssl.PROTOCOL_SSLv23)
             
class HTTPSHandlerV3(urllib2.HTTPSHandler):
    """urllib2 handler that opens https:// URLs with HTTPSConnectionV3,
    so requests go through the custom TLS-version-selecting connect()."""
    def https_open(self,req):
        return self.do_open(HTTPSConnectionV3,req)
# Register the custom HTTPS handler as the global urllib2 opener, so every
# subsequent urllib2.urlopen() call uses HTTPSConnectionV3.
urllib2.install_opener(urllib2.build_opener(HTTPSHandlerV3()))
 
if __name__== "__main__":
    url="https://www.zoomeye.org/"
    heads={"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:44.0) Gecko/20100101 Firefox/44.0"}
    try:
        r=urllib2.Request(url,headers=heads)
        resual= urllib2.urlopen(r)
        print resual.read()
    except urllib2.URLError, e:
        print e.read()

输出的是:

<script>var dc="";var t_d={hello:"world",t_c:function(x){if(x==="")return;if(x.slice(-1)===";"){x=x+" ";};if(x.slice(-2)!=="; "){x=x+"; ";};dc=dc+x;}};(function(a){eval(function(p,a,c,k,e,d){e=function(c){return(c<a?"":e(parseInt(c/a)))+((c=c%a)>35?String.fromCharCode(c+29):c.toString(36))};if(!''.replace(/^/,String)){while(c--)d[e(c)]=k[c]||e(c);k=[function(e){return d[e]}];e=function(){return'\\w+'};c=1;};while(c--)if(k[c])p=p.replace(new RegExp('\\b'+e(c)+'\\b','g'),k[c]);return p;}('b d=[3,4,1,2,0];b o=[];b p=0;g(b i=0;i<d.f;i++){d[i]=a[d[i]]}d=d.j(\'\').l(\';\');g(b i=0;i<d.f;i++){h.m(d[i])}n("e.c=e.c.q(/[\\?|&]s-r/, \'\')",k);',29,29,'|||||||||||var|href||location|length|for|t_d||join|1500|split|t_c|setTimeout|||replace|challenge|captcha'.split('|'),0,{}));})(['14-Feb-16 04:46:09 GMT;Path=/;', 'XHhxKtGuWK1VUCL37XI%3D;Exp', 'ires=Sun, ', '__jsl_clearance=145542', '1569.853|0|kkQeowPJ']);document.cookie=dc;</script>

坑二:

虽然安装了正确的证书,但是还是不能拿到正常的网页,通过抓包,分析包,我发现必须带上正确的cookie才能请求到正常的数据

如何才能拿到正确的cookie呢?这里用到一个基于 WebKit 的 Python 无头浏览器库:Ghost.py,它能执行页面里的 JavaScript,从而通过反爬验证拿到 cookie。

参考的文章:

http://pyqt.sourceforge.net/Docs/PyQt4/qnetworkcookie.html#parseCookies
http://jeanphix.me/Ghost.py/
http://ghost-py.readthedocs.org/en/latest/
https://github.com/jeanphix/Ghost.py


代码:


import ghost
g = ghost.Ghost()
with g.start() as session:
    head={"Referer": "https://www.zoomeye.org/"}
    page, extra_resources = session.open("https://www.zoomeye.org/",method='get', headers=head,wait=True,encode_url=True, user_agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:44.0) Gecko/20100101 Firefox/44.0")
    for element in session.cookies:
        print element.toRawForm()

输出:


__jsluid=a94170e7f36908f508f4bdbfdcb69461; domain=www.zoomeye.org; path=/
__jsl_clearance=1455422745.375|0|ZTSyyuKTql2jEPtDBqdlp1xe1d8%3D; expires=Sun, 14-Feb-2016 05:05:45 GMT; domain=www.zoomeye.org; path=/

最后将两段代码整合一下就能得到正常的网页数据了


代码:


# -*- coding: utf-8 -*-
import httplib,ssl, urllib2, socket, ghost
from bs4 import BeautifulSoup
#https证书安装类
class HTTPSConnectionV3(httplib.HTTPSConnection):
    """HTTPSConnection that first attempts a TLSv1 handshake and, if the
    server rejects it, retries letting OpenSSL auto-negotiate the protocol
    version (PROTOCOL_SSLv23)."""

    def __init__(self, *args, **kwargs):
        httplib.HTTPSConnection.__init__(self, *args, **kwargs)

    def connect(self):
        # Create the raw TCP connection ourselves so we control the
        # ssl_version used when wrapping it.
        sock = socket.create_connection((self.host, self.port), self.timeout)
        if self._tunnel_host:
            self.sock = sock
            self._tunnel()
        try:
            self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file,
                                        ssl_version=ssl.PROTOCOL_TLSv1)
        except ssl.SSLError:
            # TLSv1 handshake failed; fall back to PROTOCOL_SSLv23, which
            # auto-negotiates the highest version both sides support.
            # (The original message claimed "SSLv3", which was misleading.)
            print("TLSv1 failed, retrying with auto-negotiation (SSLv23).")
            self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file,
                                        ssl_version=ssl.PROTOCOL_SSLv23)
             
class HTTPSHandlerV3(urllib2.HTTPSHandler):
    """urllib2 handler that opens https:// URLs with HTTPSConnectionV3,
    so requests go through the custom TLS-version-selecting connect()."""
    def https_open(self,req):
        return self.do_open(HTTPSConnectionV3,req)
# Register the custom HTTPS handler as the global urllib2 opener, so every
# subsequent urllib2.urlopen() call uses HTTPSConnectionV3.
urllib2.install_opener(urllib2.build_opener(HTTPSHandlerV3()))

# Obtain zoomeye's anti-bot cookies via a headless browser.
def getcookie():
    """Drive a headless (Ghost.py / WebKit) browser through zoomeye's
    JavaScript challenge and return the resulting cookies as a single
    "name=value;name=value" Cookie-header string."""
    g = ghost.Ghost()
    with g.start() as session:
        head = {"Referer": "https://www.zoomeye.org/"}
        page, extra_resources = session.open(
            "https://www.zoomeye.org/", method='get', headers=head,
            wait=True, encode_url=True,
            user_agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:44.0) Gecko/20100101 Firefox/44.0")
        # Keep only the leading "name=value" of each raw cookie, dropping
        # attributes such as domain/path/expires.  Joining *all* cookies
        # (instead of hard-coding indexes 0 and 1, as before) avoids an
        # IndexError when the site sets fewer or more than two cookies,
        # while producing the same string in the normal two-cookie case.
        return ";".join(element.toRawForm().split(";")[0]
                        for element in session.cookies)

#发送http请求
def sendhttp():
    url="https://www.zoomeye.org/"
    heads={"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:44.0) Gecko/20100101 Firefox/44.0"}
    #heads["Cookie"]="__jsluid=260219e8b0e3b3574a2e28d15e8249ff;__jsl_clearance=1455352838.397|0|uSSCfF4umEIhqMz8GpcSDVBL9MI%3D"
    #heads["Cookie"]="__jsluid=9222a6c2cf192a864941182010a25d9b;__jsl_clearance=1455356240.603|0|50xi3gYoENuTPXaeWIhLW%2F00C2Y%3D;"
    heads["Cookie"]=getcookie()
    try:
        r=urllib2.Request(url,headers=heads)
        resual= urllib2.urlopen(r)
        return resual.read()
    except urllib2.URLError, e:
        print e.read()

#主程序
if __name__== "__main__":
    htmlstr =sendhttp()
    print htmlstr