python之cookie, cookiejar 模拟登录绕过验证

0.思路

如果懒得模拟登录，或者模拟登录过于复杂（多步交互或复杂验证码）则人工登录后手动复制cookie（或者代码读取浏览器cookie），缺点是容易过期。

如果登录是简单的提交表单，代码第一步模拟登录，第二步通过cookiejar访问目标url。

1.参考

python处理cookie详解

李劼杰的博客

Python使用Cookie字符串发起HTTP请求的几个方法(1)

Python使用Cookie字符串发起HTTP请求的几个方法(2)

Python使用Chrome浏览器的Cookies发起HTTP请求

fuck-login/001 zhihu/zhihu.py 一系列网站登录！

Python 爬虫之模拟知乎登录

try:

    import cookielib

except:

    import http.cookiejar as cookielib  #兼容python3

requests.session

# Reuse cookies saved by a previous login, if any.
session = requests.session()
session.cookies = cookielib.LWPCookieJar(filename='cookies')
try:
    session.cookies.load(ignore_discard=True)
except IOError:
    # A missing or unreadable cookie file is expected on the first run
    # (cookielib.LoadError is an IOError subclass): report it and go on.
    print("Cookie 未能加载")

# Save the cookies to the file so that the next run can reuse them
# instead of logging in with the account name and password again.
session.cookies.save()

IE/Firefox/Chrome等浏览器保存Cookie的位置

中大黑熊 cookielib和urllib2模块相结合模拟网站登录

现代魔法学院用Python模拟登录网站

python sqlite3查看数据库所有表(table)

Python 爬虫解决登录问题的另类方法

　　获取浏览器的 Cookies, 然后让 requests 这个库来直接使用登录好的 Cookies

Chrome 33+浏览器 Cookies encrypted_value解密脚本（python实现）

python3读取chrome浏览器cookies

10行代码爬取微信公众号文章评论

打开 Chrome 浏览器你会看到发送请求时会自动把 Cookie 信息发送给微信，我们就把这段 Cookie 数据拷贝出来，用 Python 构建一个 Cookie 对象，给 requests 使用。

from http.cookies import SimpleCookie

# Cookie header value copied from the browser's request (shortened here).
raw_cookie = "gsScrollPos-5517=; ..中间还省略很多... bizuin=2393828"

# Parse the header string, then flatten it into the plain
# name -> value mapping that requests accepts for ``cookies=``.
cookie = SimpleCookie(raw_cookie)
requests_cookies = {key: cookie[key].value for key in cookie}

r = requests.get(url, cookies=requests_cookies)

同时打开 fiddler 抓取 https 流量时，会报错：urllib2.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:590)>

https://imaojia.com/blog/questions/urlerror-urlopen-error-ssl-certificate-verify-failed-certificate-verify-failed-ssl-c-590/

https://www.python.org/dev/peps/pep-0476/

2.最快用法：fiddler request Raw 格式 + requests

fiddler 全选复制，避免结尾多余空行

import requests

# Replay a request captured with Fiddler: in the request's "Raw" view press
# Ctrl+A, copy, and paste the text into headers.txt.
# Expected layout: request line, header lines, one blank line, then (for a
# POST) the body.
with open('headers.txt') as f:
    lines = [i.strip() for i in f.readlines()]

# Fiddler's Raw view puts the complete URL in the request line.
(method, url, _) = lines[0].split()

# Split headers from body at the first blank line instead of assuming a fixed
# number of trailing lines, so an extra newline at the end of the paste (or a
# missing one) no longer drops a header or loses the body.
try:
    separator = lines.index('', 1)
except ValueError:
    separator = len(lines)
body = '\n'.join(lines[separator + 1:]).strip()
lines = lines[1:separator]

headers = {}
for line in lines:
    # Split on the first colon only: header values may contain colons.
    k, _, v = line.partition(':')
    headers[k.strip()] = v.strip()

# requests follows 3xx redirects automatically (e.g. xueqiu.com redirects to
# the personal home page).
# verify=False: certificate verification is disabled for these requests.
if method == 'POST':
    # The captured body is already URL-encoded, so it is sent unchanged.
    # Converting it to a dict would make requests encode it a second time
    # (e.g. "%40" would become "%2540").
    data = body
    r = requests.post(url, headers=headers, data=data, verify=False)
else:
    r = requests.get(url, headers=headers, verify=False)

3.最古老用法 urllib2 + cookiejar

# -*- coding: utf-8 -*-

import os

import urllib, urllib2

try:

    import cookielib

except:

    import http.cookiejar as cookielib  #兼容python3

# References:
# https://imaojia.com/blog/questions/urlerror-urlopen-error-ssl-certificate-verify-failed-certificate-verify-failed-ssl-c-590/
# https://www.python.org/dev/peps/pep-0476/
import ssl

# Turn off HTTPS certificate verification globally (not recommended).
# Interpreters without ssl._create_unverified_context are left untouched.
if hasattr(ssl, '_create_unverified_context'):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

# Alternative: create an unverified context and pass it per call.
# context = ssl._create_unverified_context()
# print urllib2.urlopen("https://imaojia.com/", context=context).read()

def login_xueqiu():
    """Log in to xueqiu.com with a form POST, then fetch a profile page.

    The login response's cookies are collected in a CookieJar; the opener
    holding that jar is installed globally and used to request the profile
    page.  Credentials are read from the environment variables
    ``xueqiu_username`` and ``xueqiu_password``; the value of
    ``xueqiu_nickname`` is asserted to occur in the fetched page.
    """
    # Author's note for capturing a clean login: in a Chrome incognito window
    # use the padlock at the top-left -> cookies in use -> delete.
    #
    # Login request as captured in Fiddler (Raw view):
    #   POST https://xueqiu.com/snowman/login HTTP/1.1
    #   Host: xueqiu.com
    #   Connection: keep-alive
    #   Content-Length: 72
    #   Pragma: no-cache
    #   Cache-Control: no-cache
    #   Accept: */*
    #   Origin: https://xueqiu.com
    #   X-Requested-With: XMLHttpRequest
    #   User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36
    #   Content-Type: application/x-www-form-urlencoded; charset=UTF-8
    #   Referer: https://xueqiu.com/
    #   Accept-Encoding: gzip, deflate, br
    #   Accept-Language: zh-CN,zh;q=0.9
    #   Cookie: aliyungf_tc=...
    #
    #   remember_me=true&username=xxx%40139.com&password=xxx&captcha=
    login_url = 'https://xueqiu.com/snowman/login'
    profile_url = 'https://xueqiu.com/u/6146070786'

    # The form fields must be converted to a URL query string.
    form_body = urllib.urlencode({
        'remember_me': 'true',  # 'true' or 'false'
        'username': os.getenv('xueqiu_username'),
        'password': os.getenv('xueqiu_password'),
    })
    login_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        # Marks the request as AJAX.  Author's note: without this header the
        # login fails with "urllib2.HTTPError: HTTP Error 404: Not Found".
        'X-Requested-With': 'XMLHttpRequest',
    }

    # Signature: urllib2.Request(url, data=None, headers={}, origin_req_host=None, unverifiable=False)
    login_request = urllib2.Request(login_url, form_body, login_headers)

    # HTTPCookieProcessor may be created without an argument, but then the
    # jar could not be inspected afterwards, so an explicit jar is passed.
    # (Pattern taken from C:\Program Files\Anaconda2\Lib\urllib2.py.)
    jar = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
    opener.open(login_request)
    # Debugging aids: print the response headers, or iterate over `jar`.

    profile_request = urllib2.Request(profile_url)
    profile_request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0')
    # profile_request.headers is now {'User-agent': 'xxx'}.
    #
    # Profile request as captured in Fiddler:
    #   GET https://xueqiu.com/u/6146070786 HTTP/1.1
    #   Host: xueqiu.com
    #   Accept-Encoding: identity
    #   User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0
    #   Cookie: remember=1; xq_is_login=1; xq_a_token.sig=xxx...
    #   Connection: close

    # Install the opener so that every later urllib2.urlopen() call uses it
    # (calling opener.open(profile_request) directly would work as well).
    urllib2.install_opener(opener)
    page = urllib2.urlopen(profile_request).read()

    assert os.getenv('xueqiu_nickname') in page
    # Alternative checks: dump `page` to login_xueqiu.html, or assert that
    # u'登录' does not occur in page.decode('utf-8').


if __name__ == '__main__':
    login_xueqiu()

4.构造cookiejar

4.1 从cookie字符串生成cookiejar

def get_cookjar_from_cookie_str(cookie, domain, path='/'):
    """Build a CookieJar from a raw ``Cookie`` header string.

    A SimpleCookie only carries name/value pairs, while a jar entry also
    needs attributes such as domain and path.  Each parsed pair is therefore
    wrapped in a ``cookielib.Cookie`` that uses the given ``domain`` and
    ``path``, and the resulting objects are added to the jar one by one.

    Args:
        cookie: cookie string such as ``"a=1; b=2"``.
        domain: domain assigned to every cookie.
        path: path assigned to every cookie (default ``'/'``).

    Returns:
        A ``cookielib.CookieJar`` holding one cookie per parsed name.
    """
    cookiejar = cookielib.CookieJar()
    simple_cookie = Cookie.SimpleCookie(cookie)
    has_domain = bool(domain)
    for name, morsel in simple_cookie.items():
        cookie_item = cookielib.Cookie(
            version=0, name=name, value=str(morsel.value),
            port=None, port_specified=False,
            domain=domain, domain_specified=has_domain,
            domain_initial_dot=has_domain and domain.startswith('.'),
            path=path, path_specified=bool(path),
            secure=False,
            expires=None,
            discard=False,
            comment=None,
            comment_url=None,
            # Must be a dict: has_nonstandard_attr() and the LWP cookie file
            # writer use this mapping and fail when it is None.
            rest={},
            rfc2109=False,
        )
        cookiejar.set_cookie(cookie_item)
    return cookiejar

4.2 解析浏览器cookie文件生成cookiejar (借助sqlite3, win32crypt.CryptUnprotectData)

def parse_browser_cookie_file(browser='chrome', domain=None):
    """Read cookies from the local Chrome or Firefox cookie database.

    The browser's SQLite file is copied to ``cookies_temp`` in the current
    directory and the copy is queried; the copy is deleted before returning,
    also when reading fails.

    Args:
        browser: ``'chrome'`` or ``'firefox'``.
        domain: optional substring that the cookie's host must contain;
            falsy means all cookies.

    Returns:
        A tuple ``(cookiejar, cookie_dict, cookie_str)``: a
        ``cookielib.CookieJar``, a name -> value dict, and the pairs joined
        as ``name=value;name=value``.

    Raises:
        ValueError: ``browser`` is neither ``'chrome'`` nor ``'firefox'``.
        Exception: no cookie database was found.
    """
    cookie_file_path_temp = 'cookies_temp'

    if browser == 'chrome':
        # e.g. C:\Users\win7\AppData\Local\Google\Chrome\User Data\Default\Cookies
        cookie_file_path = os.path.join(os.environ['LOCALAPPDATA'], r'Google\Chrome\User Data\Default\Cookies')
        sql = 'select host_key, name, encrypted_value, path from cookies'
        host_column = 'host_key'
    elif browser == 'firefox':
        # e.g. C:\Users\win7\AppData\Roaming\Mozilla\Firefox\Profiles\owmkid1w.default\cookies.sqlite
        firefox_dir_path = os.path.join(os.environ['APPDATA'], r'Mozilla\Firefox\Profiles')
        candidates = [
            os.path.join(firefox_dir_path, entry, 'cookies.sqlite')
            for entry in os.listdir(firefox_dir_path)
        ]
        candidates = [p for p in candidates if os.path.exists(p)]
        if not candidates:
            raise Exception('Cookies file not exist!')
        # Several xxx.default profile folders may exist: take the one whose
        # cookie database is largest.
        cookie_file_path = max(candidates, key=lambda p: os.stat(p).st_size)
        sql = 'select host, name, value, path from moz_cookies'
        host_column = 'host'
    else:
        raise ValueError('unsupported browser: %r' % (browser,))

    if not os.path.exists(cookie_file_path):
        raise Exception('Cookies file not exist!')

    # Bind the domain filter as a parameter instead of formatting it into the
    # statement, so quotes inside `domain` cannot change the query.
    params = ()
    if domain:
        sql += ' where %s like ?' % host_column
        params = ('%{}%'.format(domain),)

    # Query a copy of the file (author's note: otherwise
    # "sqlite3.OperationalError: database is locked").
    shutil.copy(cookie_file_path, cookie_file_path_temp)

    cookie_dict = {}
    cookiejar = cookielib.CookieJar()
    try:
        conn = sqlite3.connect(cookie_file_path_temp)
        try:
            c = conn.cursor()

            # Notes from exploring Chrome's file (it can also be opened with
            # SQLiteSpy.exe); see http://www.cnblogs.com/doudongchun/p/3694803.html
            #   tables:  [(u'cookies',), (u'meta',)]
            #   columns: PRAGMA table_info('cookies') lists creation_utc,
            #            host_key, name, value, path, expires_utc, secure,
            #            httponly, last_access_utc, has_expires, persistent,
            #            priority, encrypted_value (BLOB), firstpartyonly
            # Chrome 33+ stores the cookie in encrypted_value and leaves the
            # value column empty; see
            #   https://jecvay.com/2015/03/python-chrome-cookies.html
            #   http://www.ftium4.com/chrome-cookies-encrypted-value-python.html
            c.execute("select name from sqlite_master where type='table' order by name")
            print(c.fetchall())

            for row in c.execute(sql, params):
                # row is the tuple (host, name, value or encrypted_value, path).
                if browser == 'chrome':
                    ret = win32crypt.CryptUnprotectData(row[2], None, None, None, 0)
                    value = ret[1].decode()
                else:
                    value = row[2]
                cookie_dict[row[1]] = value
                cookie_item = cookielib.Cookie(
                    version=0, name=row[1], value=value,
                    port=None, port_specified=False,
                    domain=row[0], domain_specified=bool(row[0]),
                    domain_initial_dot=bool(row[0]) and row[0].startswith('.'),
                    path=row[3], path_specified=bool(row[3]),
                    secure=False,
                    expires=None,
                    discard=False,
                    comment=None,
                    comment_url=None,
                    # Must be a dict; None breaks has_nonstandard_attr().
                    rest={},
                    rfc2109=False,
                )
                cookiejar.set_cookie(cookie_item)
        finally:
            conn.close()
    finally:
        os.remove(cookie_file_path_temp)

    cookie_str = ';'.join(['%s=%s' % (k, v) for k, v in cookie_dict.items()])
    return (cookiejar, cookie_dict, cookie_str)

5.使用cookie字符串或cookiejar

5.1 调用 urllib2.urlopen(req) 之前，先执行 req.add_header('Cookie', 复制的cookie字符串)

import ssl

context = ssl._create_unverified_context()

def urllib2_Request_with_cookie_str(url, cookie, verify):

    cookie = re.sub('\n', '', cookie)

    # urllib2.Request(self, url, data=None, headers={}, origin_req_host=None, unverifiable=False)

    req = urllib2.Request(url)

    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0')

    req.add_header('Cookie',cookie)

    # urllib2.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:590)>

    try:

        resp = urllib2.urlopen(req)

    except urllib2.URLError as err:

        print err

        resp = urllib2.urlopen(req, context=context)  # 同时打开fiddler的影响  

    html_doc = resp.read()

    with open('urllib2_Request_with_cookie_str.html','wb') as f:

        f.write(html_doc)

    print 'urllib2_Request_with_cookie_str', url, verify, verify in html_doc

5.2 requests.get 传参 cookies=cookie字符串/dict/cookiejar

import Cookie

# cookie 接受类型：str, dict, cookiejar

def requests_with_cookie(url, cookie, verify):

    # requests cookie 接受 dict 或 cookiejar，需要将字符串转dict

    if isinstance(cookie, basestring):

        if isinstance(cookie, unicode):

            cookie = cookie.encode('utf-8')

        cookie = re.sub('\n', '', cookie)

        # SimpleCookie supports strings as cookie values.

        simple_cookie = Cookie.SimpleCookie(cookie)

        cookie = dict([(c, simple_cookie[c].value) for c in simple_cookie])

        # 10行代码爬取微信公众号文章评论

        # https://mp.weixin.qq.com/s/Qbeyk2hncKDaz1iT54iwTA

        # 把这段 Cookie 数据拷贝出来，用 Python 构建一个 Cookie 对象，给 requests 使用。

        # simple_cookie = Cookie.SimpleCookie(cookie)

        # from http.cookies import SimpleCookie

        # simple_cookie = SimpleCookie(cookie)

    # 字典最后一项多出逗号也无妨

    headers = {

    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0",

    }

    # requests.exceptions.SSLError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:590)

    # https://*.com/questions/10667960/python-requests-throwing-up-sslerror

    # 简单直接的解决办法 verify=False

    try:

        r = requests.get(url, headers=headers, cookies=cookie)

    except requests.exceptions.SSLError as err:

        print err

        r = requests.get(url, headers=headers, cookies=cookie, verify=False)

    print 'requests_with_cookie', url, verify, verify in r.content

    with open('requests_with_cookie.html','wb') as f:

        f.write(r.content)

5.3 urllib2.build_opener传入 urllib2.HTTPCookieProcessor(cookiejar)

import ssl

#ssl._create_default_https_context = ssl._create_unverified_context

context = ssl._create_unverified_context()

def opener_with_cookiejar(url, cookie, verify): 

    req = urllib2.Request(url)

    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0')

    # handler = urllib2.HTTPCookieProcessor(cookie)

    # opener = urllib2.build_opener(handler)  

    # 参考添加 context 参数， 否则得用全局ssl设置

    # C:\Program Files\Anaconda2\Lib\urllib2.py

        # def urlopen

            # elif context:

                # https_handler = HTTPSHandler(context=context)

                # opener = build_opener(https_handler)   

    # urllib2.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:590)>

    try:

        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))

        resp = opener.open(req)

    except urllib2.URLError as err:

        print err

        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar), urllib2.HTTPSHandler(context=context))  #叠加多个 handler

        resp = opener.open(req)

    html_doc = resp.read()

    with open('opener_with_cookiejar.html','wb') as f:

        f.write(html_doc)

    print 'opener_with_cookiejar', url, verify, verify in html_doc

5.4 更加底层 httplib.HTTPConnection 传入 cookie字符串

import httplib

import urlparse

# 不需要用到 import ssl 的设置!!!速度快！！！

def httplib_conn_with_cookie_str(url, cookie, verify):

    # url = 'https://xueqiu.com'

    url_ori = url

    cookie = re.sub('\n', '', cookie)

    ret = urlparse.urlparse(url)    # Parse input URL

    if ret.scheme == 'http':

        conn = httplib.HTTPConnection(ret.netloc)

    elif ret.scheme == 'https':

        conn = httplib.HTTPSConnection(ret.netloc)

    url = ret.path

    if ret.query: url += '?' + ret.query

    if ret.fragment: url += '#' + ret.fragment

    if not url: url = '/'

    print url

    conn.request(method='GET', url=url , headers={'Cookie': cookie})

    # 如果传入url = 'https://xueqiu.com' ,返回内容为：

    # Redirecting to <a href="/4xxxxxxxxx/">/4xxxxxxxxx/</a>.

    # 却没有处理重导向！

    resp = conn.getresponse()

    html_doc = resp.read()

    with open('httplib_conn_with_cookie_str.html','wb') as f:

        f.write(html_doc)

    print 'httplib_conn_with_cookie_str', url_ori, verify, verify in html_doc

6.第三方库

https://pypi.python.org/pypi/browser-cookie3/0.6.1

https://pypi.python.org/pypi/browsercookie/0.7.2

秒客网