03: Scraping Web Data with requests and BeautifulSoup

Date: 2024-01-02 19:49:18

1.1 Review of Crawler-Related Module Commands

  1. requests module

    1. pip install requests

    2. response = requests.get('http://www.baidu.com/')        # fetch the page at the given URL

    3. response.text                                           # response body as text

    4. response.content                                        # response body as bytes

    5. response.encoding = 'utf-8'                             # decode the page as utf-8

       response.encoding = response.apparent_encoding          # decode with whatever encoding the downloaded page uses

    6. response.cookies                                        # the cookies returned by the server

       response.cookies.get_dict()                             # the same cookies as a plain dict
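
A minimal sketch tying the calls above together (the URL is just the placeholder from the list):

import requests

response = requests.get('http://www.baidu.com/')
response.encoding = response.apparent_encoding   # decode with the page's own encoding
print(response.text[:200])                       # first 200 characters of the HTML
print(response.cookies.get_dict())               # cookies as a plain dict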

  2. BeautifulSoup module

    1. pip install beautifulsoup4

    2. Parse the text into an object (a combined, runnable sketch follows this list):

       1) html.parser is built into Python, no install needed

          soup = BeautifulSoup(response.text, features='html.parser')

       2) lxml is a third-party library but performs better (use it in production)

          soup = BeautifulSoup(response.text, features='lxml')

    3. .find(): returns a single object

       1) From the scraped content, find the div with id="auto-channel-lazyload-article"

          target = soup.find(id="auto-channel-lazyload-article")

       2) Find a div whose id attribute is 'i1'

          target = soup.find('div', id='i1')

    4. .find_all(): returns a list of objects

       1) From the target object obtained above, find all li tags

          li_list = target.find_all('li')

    5. Pulling attributes out of an object returned by .find()

       a.attrs.get('href')                          # the href attribute of an a tag (its URL)

       a.find('h3').text                            # the text of the h3 tag inside the a tag

       img_url = a.find('img').attrs.get('src')     # the src attribute of the img tag inside the a tag (image URL)
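
A self-contained sketch that exercises items 2-5 against a small inline HTML fragment (the fragment is made up for illustration, so it runs without any network access):

from bs4 import BeautifulSoup

html = '''
<div id="i1">
  <li><a href="/news/1"><h3>First story</h3><img src="//img.example/1.jpg"></a></li>
  <li><a href="/news/2"><h3>Second story</h3><img src="//img.example/2.jpg"></a></li>
</div>
'''

soup = BeautifulSoup(html, features='html.parser')
target = soup.find('div', id='i1')            # .find() returns a single Tag
for li in target.find_all('li'):              # .find_all() returns a list of Tags
    a = li.find('a')
    print(a.attrs.get('href'))                # /news/1, /news/2
    print(a.find('h3').text)                  # First story, Second story
    print(a.find('img').attrs.get('src'))     # //img.example/1.jpg, ...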

1.2 Scraping Pages That Do and Do Not Require Login

import requests
import uuid
from bs4 import BeautifulSoup

response = requests.get(
    url='http://www.autohome.com.cn/news/'
)
response.encoding = response.apparent_encoding  # decode with whatever encoding the downloaded page uses

# 1 Parse the text into an object.
# soup = BeautifulSoup(response.text, features='lxml')  # lxml is third-party but faster (use it in production)
soup = BeautifulSoup(response.text, features='html.parser')  # html.parser is built into Python, no install needed

# 2 From the scraped content, find the div with id="auto-channel-lazyload-article"
target = soup.find(id="auto-channel-lazyload-article")

# 3.1 Find all li tags; .find() returns only the first match
# 3.2 Combined lookups also work: .find('div', id='i1')
# 3.3 .find() returns an object; .find_all() returns a list
li_list = target.find_all('li')

for i in li_list:
    a = i.find('a')
    if a:
        print(a.attrs.get('href'))  # the URL of each a tag

        # a.find('h3') returns an object; add .text to get the text
        txt = a.find('h3').text  # the text of the h3 tag inside the a tag
        print(txt, type(txt))

        img_url = a.find('img').attrs.get('src')  # the img src inside the a tag (image URL)
        file_name = str(uuid.uuid4()) + '.jpg'
        if img_url.startswith('//www2'):  # the scraped image URLs are protocol-relative, so rewrite them
            img_url2 = img_url.replace('//www2', 'http://www3')
            img_response = requests.get(url=img_url2)
            with open(file_name, 'wb') as f:
                f.write(img_response.content)  # save the image locally

Example 1: Scraping the Autohome news page (a page that requires no login)

import requests

# 1 Put the chouti.com username and password into a dict
post_dict = {
    'phone': '',
    'password': '',
    'oneMonth': 1
}

# 2 POST the credentials dict to chouti's login page
response = requests.post(
    url='http://dig.chouti.com/login',
    data=post_dict
)

# 3 This is the response body after a successful login
print(response.text)
# {"result":{"code":"9999", "message":"", "data":{"complateReg":"0","destJid":"cdu_49844923242"}}}

# 4 Print the cookie dict returned after a successful login
cookie_dict = response.cookies.get_dict()
print(cookie_dict)
# {'JSESSIONID': 'aaaVizwwcod_L5QcwwR9v', 'puid': 'd332ef55361217e544b91f081090ad5e',
#  'route': '37316285ff8286c7a96cd0b03d38e13b', 'gpsd': 'f8b07e259141ae5a11d930334fbfb609'}

# 5 Whenever we need a page that only a logged-in user can see, attach the cookie dict to the request
response = requests.get(
    url='http://dig.chouti.com/profile',
    cookies=cookie_dict
)

Example 2: Logging in to chouti automatically and fetching the user profile page (cookie approach)
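
To branch on whether the login actually succeeded, one option is to parse the JSON body shown above (a sketch; the '9999' success code is taken from the sample response printed in Example 2 and may not be chouti's only convention):

import json

result = json.loads(response.text)
if result.get('result', {}).get('code') == '9999':  # '9999' appears in the sample success response
    print('login ok')
else:
    print('login failed:', result)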

1.3 Summary of Crawler Login Examples
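
All of the examples below follow the same three-step pattern. A minimal generic sketch of that pattern (the example.com endpoints and form fields are placeholders, since every site names them differently):

import requests

session = requests.Session()

# 1) GET any page first: the server may hand out an initial cookie and/or a csrf token.
session.get('https://example.com/login')

# 2) POST the credentials; the Session resends the initial cookie automatically,
#    and on success the server marks that cookie as authorized.
session.post('https://example.com/login', data={'user': '...', 'password': '...'})

# 3) Fetch protected pages with the same Session; the authorized cookie rides along.
print(session.get('https://example.com/profile').text)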

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests

# ## 1. First visit any page to obtain an initial cookie
i1 = requests.get(url="http://dig.chouti.com/")
i1_cookies = i1.cookies.get_dict()

# ## 2. Log in carrying the previous cookie; the backend authorizes its gpsd value
i2 = requests.post(
    url="http://dig.chouti.com/login",
    data={
        'phone': "",
        'password': "7481079xl",
        'oneMonth': ""
    },
    cookies=i1_cookies
)

# ## 3. Upvote (carrying the already-authorized gpsd is enough)
gpsd = i1_cookies['gpsd']
i3 = requests.post(
    url="http://dig.chouti.com/link/vote?linksId=15074576",
    cookies={'gpsd': gpsd}
)
print(i3.text)

Example 1 (approach 1): Upvoting on chouti with cookies

import requests

session = requests.Session()  # a Session stores and resends cookies across requests automatically

i1 = session.get(url="http://dig.chouti.com/help/service")  # first request picks up the initial cookie
i2 = session.post(
    url="http://dig.chouti.com/login",
    data={
        'phone': "",
        'password': "7481079xl",
        'oneMonth': ""
    },
)
i3 = session.post(
    url="http://dig.chouti.com/link/vote?linksId=15074576"  # the authorized cookie rides along
)
print(i3.text)

Example 2 (approach 2): Upvoting on chouti with a Session. With a Session there is no need to collect and pass cookie dicts by hand; requests stores and resends them automatically.

import requests
from bs4 import BeautifulSoup

# Step 1: get the csrf token
# 1.1 Fetch the login page
r1 = requests.get(url='https://github.com/login')
# 1.2 Parse the text into an object
b1 = BeautifulSoup(r1.text, 'html.parser')
# 1.3 Find the csrf_token input tag
tag = b1.find(name='input', attrs={'name': 'authenticity_token'})
# 1.4 Read the csrf_token value
# tag.get('value') is equivalent to tag.attrs.get('value')
token = tag.get('value')
# 1.5 Grab the cookie dict returned by the first GET request
r1_cookie = r1.cookies.get_dict()
print('first request', r1_cookie)

# Step 2: POST the username and password together with the cookie from the first GET; the backend authorizes it
# 2.1 Send the login POST carrying csrf_token, cookies, username, and password
# requests.post() is equivalent to requests.request('post', ...)
r2 = requests.post(
    url='https://github.com/session',
    data={  # this data dict must match the format of a real login form
        'commit': 'Sign in',
        'utf8': '✓',
        'authenticity_token': token,
        'login': '1532363461@qq.com',
        'password': '7481079xl',
    },
    cookies=r1_cookie,
)
# 2.2 Grab the cookie dict from the second response
r2_cookie = r2.cookies.get_dict()
print('second request', r2_cookie)
# 2.3 Merge the two cookie dicts: keep r1_cookie entries, overwriting overlaps with r2_cookie
r1_cookie.update(r2_cookie)

# Step 3: fetch the profile page carrying the merged cookies
r3 = requests.get(
    url='https://github.com/settings/profile',
    cookies=r1_cookie,  # carry the logged-in cookies when fetching data
)
print(r3.text)

Example 3: Logging in to GitHub with a crawler and fetching the user's profile settings
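
The same flow fits in fewer lines with requests.Session, which replaces the manual cookie bookkeeping of steps 1.5 and 2.1-2.3 (a sketch reusing the form fields from the example above):

import requests
from bs4 import BeautifulSoup

session = requests.Session()
r1 = session.get('https://github.com/login')
token = BeautifulSoup(r1.text, 'html.parser').find(
    'input', attrs={'name': 'authenticity_token'}).get('value')
session.post('https://github.com/session', data={
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': token,
    'login': '1532363461@qq.com',
    'password': '7481079xl',
})
r3 = session.get('https://github.com/settings/profile')  # cookies are carried automatically
print(r3.text)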

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import time

import requests
from bs4 import BeautifulSoup

session = requests.Session()

# 1 Fetch the sign-in page and pull the _xsrf token out of it
i1 = session.get(
    url='https://www.zhihu.com/#signin',
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    }
)
soup1 = BeautifulSoup(i1.text, 'lxml')
xsrf_tag = soup1.find(name='input', attrs={'name': '_xsrf'})
xsrf = xsrf_tag.get('value')

# 2 Download the captcha image and ask the user to read it
current_time = time.time()
i2 = session.get(
    url='https://www.zhihu.com/captcha.gif',
    params={'r': current_time, 'type': 'login'},
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    })
with open('zhihu.gif', 'wb') as f:
    f.write(i2.content)
captcha = input('Open zhihu.gif, then type the captcha you see: ')

# 3 Log in with the token, captcha, and credentials
form_data = {
    "_xsrf": xsrf,
    'password': 'xxooxxoo',
    "captcha": captcha,  # the value the user just typed, not the literal string 'captcha'
    'email': '424662508@qq.com'
}
i3 = session.post(
    url='https://www.zhihu.com/login/email',
    data=form_data,
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    }
)

# 4 Fetch the profile page and read the nickname out of it
i4 = session.get(
    url='https://www.zhihu.com/settings/profile',
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    }
)
soup4 = BeautifulSoup(i4.text, 'lxml')
tag = soup4.find(id='rename-section')
nick_name = tag.find('span', class_='name').string
print(nick_name)

Example 4: Logging in to Zhihu

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
import json
import base64

import rsa
import requests


def js_encrypt(text):
    # RSA-encrypt the text with cnblogs' public key, mimicking what the site's JavaScript does
    b64der = 'MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCp0wHYbg/NOPO3nzMD3dndwS0MccuMeXCHgVlGOoYyFwLdS24Im2e7YyhB0wrUsyYf0/nhzCzBK8ZC9eCWqd0aHbdgOQT6CuFQBMjbyGYvlVYU2ZP7kG9Ft6YV6oc9ambuO7nPZh+bvXH0zDKfi02prknrScAKC0XhadTHT3Al0QIDAQAB'
    der = base64.standard_b64decode(b64der)
    pk = rsa.PublicKey.load_pkcs1_openssl_der(der)
    v1 = rsa.encrypt(bytes(text, 'utf8'), pk)
    value = base64.encodebytes(v1).replace(b'\n', b'')
    value = value.decode('utf8')
    return value


session = requests.Session()

# 1 Fetch the sign-in page and extract the VerificationToken with a regex
i1 = session.get('https://passport.cnblogs.com/user/signin')
rep = re.compile("'VerificationToken': '(.*)'")
v = re.search(rep, i1.text)
verification_token = v.group(1)

# 2 POST the RSA-encrypted username and password as JSON, with the token in a header
form_data = {
    'input1': js_encrypt('wptawy'),
    'input2': js_encrypt('asdfasdf'),
    'remember': False
}
i2 = session.post(url='https://passport.cnblogs.com/user/signin',
                  data=json.dumps(form_data),
                  headers={
                      'Content-Type': 'application/json; charset=UTF-8',
                      'X-Requested-With': 'XMLHttpRequest',
                      'VerificationToken': verification_token}
                  )

# 3 Fetch a page that only a logged-in user can see
i3 = session.get(url='https://i.cnblogs.com/EditDiary.aspx')
print(i3.text)

Example 5: Logging in to Cnblogs