python16_day36【爬虫1】

时间:2020-12-04 07:29:28

一、requests

  1. GET请求

 # Example 1: GET request without query parameters.
import requests

ret = requests.get('https://github.com/timeline.json')

print(ret.url)
print(ret.text)

# Example 2: GET request with query parameters.
import requests

payload = {'key1': 'value1', 'key2': 'value2'}
# The payload dict is handed to requests via `params`; printing ret.url
# afterwards shows the URL that was actually requested.
ret = requests.get("http://httpbin.org/get", params=payload)

print(ret.url)
print(ret.text)

  2.POST请求

 # Example 1: basic POST request sending a dict as the request data.
import requests

payload = {'key1': 'value1', 'key2': 'value2'}
ret = requests.post("http://httpbin.org/post", data=payload)

print(ret.text)

# Example 2: POST request with explicit request headers and a JSON body.
import requests
import json

url = 'https://api.github.com/some/endpoint'
payload = {'some': 'data'}
headers = {'content-type': 'application/json'}

# The payload is serialized with json.dumps by hand, so the matching
# content-type header is supplied explicitly alongside it.
ret = requests.post(url, data=json.dumps(payload), headers=headers)

print(ret.text)
print(ret.cookies)

  3.其它请求  

 # Overview of the request helpers covered in this section. These lines read
 # as a signature listing: `url`, `method` and `kwargs` are placeholders that
 # are not defined anywhere in this snippet, so it is not meant to be run.
 requests.get(url, params=None, **kwargs)
requests.post(url, data=None, json=None, **kwargs)
requests.put(url, data=None, **kwargs)
requests.head(url, **kwargs)
requests.delete(url, **kwargs)
requests.patch(url, data=None, **kwargs)
requests.options(url, **kwargs)
# All of the helpers above are built on top of this method.
requests.request(method, url, **kwargs)

  4.汽车之家新闻

 # Example: print title, image path and summary for the news entries on the
 # autohome.com.cn news page.
import requests
from bs4 import BeautifulSoup

# Alternative, commented-out version of the same scraper (kept for reference):
#
# response = requests.get("http://www.autohome.com.cn/news/")
# # response.text is str
# # response.content is bytes
#
# response.encoding = 'gbk'  # the site uses gbk
# root = BeautifulSoup(response.text, 'html.parser')  # parse the response with bs
# outer_div_obj = root.find(name='div', id='auto-channel-lazyload-article')  # find the div by id
# li_obj_list = outer_div_obj.find_all(name='li')  # every <li> inside that div
#
# for li_obj in li_obj_list:
#     if not li_obj.find('h3'):
#         continue
#     title_obj = li_obj.find('h3')  # the <h3> tag object
#     summary_obj = li_obj.find('p')  # the <p> tag object
#     img_obj = li_obj.find('img')  # the <img> tag object
#     src = img_obj.attrs.get('src')  # read the src attribute from the <img> tag
#
#     print(src, title_obj.text, summary_obj.text)

response = requests.get("http://www.autohome.com.cn/news/")
# Set the encoding before reading response.text (the original note says the
# site uses gbk).
response.encoding = 'gbk'

soup = BeautifulSoup(response.text, 'html.parser')
tag = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
li_list = tag.find_all('li')

for li in li_list:
    h3 = li.find('h3')
    if not h3:
        # <li> items without an <h3> headline are skipped.
        continue
    print("\033[33;1m标题: {0}\033[0m".format(h3.text))
    print("\033[34;1m路径: http://{0}\033[0m".format(li.find('img').attrs['src']))
    print("\033[34;1m内容: {0}\033[0m".format(li.find('p').text))

  5.github登录

 #!/usr/bin/env python
# -*-coding:utf8-*-
# __author__ = "willian"
#
# Example: log in to GitHub with three plain requests, carrying the cookies
# from one request to the next by hand.
import requests
from bs4 import BeautifulSoup

# Request 1: load the login page to obtain the token and the cookies.
r1 = requests.get('https://github.com/login')
b1 = BeautifulSoup(r1.text, 'html.parser')
# Read the token from the <input name="authenticity_token"> element.
auth_token = b1.find(name='input', attrs={'name': 'authenticity_token'}).get('value')
# Cookies returned by the first request.
r1_cookie_dict = r1.cookies.get_dict()

# Request 2: submit the credentials together with the token and cookies.
# 'login' and 'password' are left blank here and must be filled in.
r2 = requests.post(
    "https://github.com/session",
    data={
        'commit': "Sign in",
        'utf8': '✓',
        'authenticity_token': auth_token,
        'login': '',
        'password': ""
    },
    cookies=r1_cookie_dict,
)
# Cookies returned by the login request.
r2_cookie_dict = r2.cookies.get_dict()

# Merge the cookies of both requests; values from request 2 win on conflict
# because they are applied last.
all_cookie_dict = {}
all_cookie_dict.update(r1_cookie_dict)
all_cookie_dict.update(r2_cookie_dict)

# Request 3: fetch a personal settings page, which is only available after a
# successful login.
r3 = requests.get('https://github.com/settings/emails', cookies=all_cookie_dict)
print(r3.text)

  6.抽屉点赞

 #!/usr/bin/env python
# -*-coding:utf8-*-
# __author__ = "willian"
#
# Example: log in to dig.chouti.com and up-vote a link, carrying the cookies
# from one request to the next by hand.
import requests
from bs4 import BeautifulSoup

# 1. Initial request to obtain cookies.
r0 = requests.get("http://dig.chouti.com")
r0_cookie_dict = r0.cookies.get_dict()

# 2. Authorize: post the credentials together with the cookies from step 1.
# 'phone' and 'password' hold placeholder values ('xx').
r1 = requests.post(
    url="http://dig.chouti.com/login",
    data={
        'phone': 'xx',
        'password': 'xx',
        'oneMonth': 1
    },
    cookies=r0_cookie_dict
)
r1_cookie_dict = r1.cookies.get_dict()

# Merge the cookies of both requests; values from the login response win on
# conflict because they are applied last.
all_cookies = {}
all_cookies.update(r0_cookie_dict)
all_cookies.update(r1_cookie_dict)

# 3. Up-vote the link, sending the merged cookies.
r2 = requests.post(url='http://dig.chouti.com/link/vote?linksId=14808951', cookies=all_cookies)
print(r2.text)

二、BeautifulSoup4

三、wechat