1.爬虫的本质是什么?
模仿浏览器的行为,爬取网页信息。
2.requests
1.get请求
无参数实例
import requests

# Plain GET request without query parameters.
ret = requests.get('https://github.com/timeline.json')
print(ret.text)
有参数实例
import requests

# `params` is encoded into the URL query string (?key1=value1&key2=value2).
ret = requests.get("http://httpbin.org/get", params={'key1': 'value1', 'key2': 'value2'})
print(ret.text)
get
2.post请求
import json

import requests

url = 'https://api.github.com/some/endpoint'
payload = {'v1': 'k1'}
headers = {'content-type': 'application/json'}

# The payload is serialized by hand, so the JSON content type is declared explicitly.
ret = requests.post(url, data=json.dumps(payload), headers=headers)
print(ret.text)
3.其他请求
requests.get(url, params=None, **kwargs)
requests.post(url, data=None, json=None, **kwargs)
requests.put(url, data=None, **kwargs)
requests.head(url, **kwargs)
requests.delete(url, **kwargs)
requests.patch(url, data=None, **kwargs)
requests.options(url, **kwargs)
# 以上方法均是在此方法的基础上构建
requests.request(method, url, **kwargs)
4.更多参数与实例
- method
def param_method_url():
    """Issue the same request twice, choosing the HTTP verb through `method`.

    Both calls go to a local test server; the responses are not used.
    """
    ret = requests.request(method='get', url='http://127.0.0.1:8000/test/')
    ret = requests.request(method='post', url='http://127.0.0.1:8000/test/')
- params
import requests

requests.get(url='http://127.0.0.1:8000/test/',
             params={'k1': 'v1', 'k2': 'v2'})
# Equivalent to requests.get(url='xxxxx?k1=v1&k2=v2')
- data
# 可以是字典
# 可以是字符串
# 可以是字节
# 可以是文件对象
# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# data={'k1': 'v1', 'k2': '水电费'})

# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# data="k1=v1; k2=v2; k3=v3; k3=v4"
# )

# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# data="k1=v1;k2=v2;k3=v3;k3=v4",
# headers={'Content-Type': 'application/x-www-form-urlencoded'}
# )

# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# data=open('data_file.py', mode='r', encoding='utf-8'), # 文件内容是:k1=v1;k2=v2;k3=v3;k3=v4
# headers={'Content-Type': 'application/x-www-form-urlencoded'}
# )
- json
# When the server expects a JSON payload as the request body, pass it via `json=`.
requests.request(method='POST',
                 url='http://127.0.0.1:8000/test/',
                 json={'k1': 'v1', 'k2': '水电费'})
- cookies
ret1 = requests.get(
    url='https://dig.chouti.com/',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    },
)
# ret1.cookies holds the cookies returned for this URL;
# get_dict() turns them into a plain dict.
ret1_cookies = ret1.cookies.get_dict()
- headers
# Send custom request headers to the server.
# NOTE(review): the body is still JSON (`json=`) while the header announces a
# form-encoded body; this only demonstrates passing `headers` — confirm what
# the target server really expects.
requests.request(method='POST',
                 url='http://127.0.0.1:8000/test/',
                 json={'k1': 'v1', 'k2': '水电费'},
                 headers={'Content-Type': 'application/x-www-form-urlencoded'}
                 )
# Which headers are actually required depends on the server.
- files
# 发送文件
# file_dict = {
# 'f1': open('readme', 'rb')
# }
# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# files=file_dict)

# 发送文件,定制文件名
# file_dict = {
# 'f1': ('test.txt', open('readme', 'rb'))
# }
# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# files=file_dict)

# 发送文件,定制文件名
# file_dict = {
# 'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf")
# }
# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# files=file_dict)

# 发送文件,定制文件名
# file_dict = {
# 'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf", 'application/text', {'k1': '0'})
# }
# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# files=file_dict)
pass
- timeout
设置超时时间,如果访问超过超时时间就停止访问
# ret = requests.get('http://google.com/', timeout=1)
# print(ret)
# ret = requests.get('http://google.com/', timeout=(5, 1))
# print(ret)
pass
- allow_redirects
#是否允许重定向,默认为true
ret = requests.get('http://127.0.0.1:8000/test/', allow_redirects=False)
print(ret.text)
BeautifulSoup
该模块可以将接收到的html和xml进行格式化,通过操作对象的方式快速的找到想要的标签
- 使用实例
from bs4 import BeautifulSoup html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
...
</body>
</html>
""" soup = BeautifulSoup(html_doc, features="lxml") - name--->标签名
# tag = soup.find('a')
# name = tag.name # 获取
# print(name)
# tag.name = 'span' # 设置
- attr--->标签属性
# tag = soup.find('a')
# attrs = tag.attrs # 获取
# print(attrs)
# tag.attrs = {'ik':123} # 设置
# tag.attrs['id'] = 'iiiii' # 设置
- children--->所有子标签
# body = soup.find('body')
# v = body.children
- descendants 所有后代
# body = soup.find('body')
# v = body.descendants
- clear--->将标签的所有子标签全部清空(保留标签名)
# tag = soup.find('body')
# tag.clear()
# print(soup)
- extract,递归的删除所有的标签,并获取删除的标签
# body = soup.find('body')
# v = body.extract()
# print(soup)
- find,获取匹配的第一个标签
# tag = soup.find('a')
# print(tag)
# tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tag)
- find_all,获取匹配的所有标签
# tags = soup.find_all('a')
# print(tags)
# tags = soup.find_all('a',limit=1)
# print(tags)
# tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# # tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tags)
# ####### 列表 #######
# v = soup.find_all(name=['a','div'])
# print(v)
# v = soup.find_all(class_=['sister0', 'sister'])
# print(v)
- has_attr,检查标签是否具有该属性
# tag = soup.find('a')
# v = tag.has_attr('id')
# print(v)
爬取汽车之家实例
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup  # HTML parsing

PAGE_URL = 'https://www.autohome.com.cn/news/'

# Download the page. `apparent_encoding` is the encoding requests detects from
# the body; using it avoids hard-coding something like 'gbk'.
ret = requests.get(url=PAGE_URL)
ret.encoding = ret.apparent_encoding

# Parse the page. 'html.parser' ships with Python; lxml is the common choice
# in production but has to be installed separately.
soup = BeautifulSoup(ret.text, features='html.parser')

# find() returns only the first match. To match on class as well, use e.g.
#   soup.find(name='div', attrs={'class': 'dddd', 'id': 'dfa'})
div = soup.find(name='div', id='auto-channel-lazyload-article')

# find_all() returns a list, so .find() cannot be called on its result.
# An empty list is used when the container is missing, so the loop is skipped
# instead of failing on None.
li_list = div.find_all(name='li') if div else []

for row in li_list:
    # Rows without an <h3> are not news entries.
    if not row.find(name='h3'):
        continue

    a = row.find(name='a')
    p = row.find(name='p')
    li_img = row.find(name='img')
    src = li_img.get('src') if li_img else None
    if not (a and p and src):
        # Incomplete entry: skip it rather than crash on a missing tag.
        continue

    print(a.get('href'))
    print(p.text)

    # Keep the part after the last '__'; if there is none, fall back to the
    # last path segment so the name never contains a directory separator.
    file_name = src.rsplit('__', maxsplit=1)[-1].rsplit('/', maxsplit=1)[-1]

    # urljoin handles protocol-relative ('//host/x.jpg'), relative and
    # absolute image URLs alike.
    ret_img = requests.get(urljoin(PAGE_URL, src))
    with open(file_name, 'wb') as f:
        f.write(ret_img.content)
抽屉实例
import requests
from bs4 import BeautifulSoup

# Browser-like request headers, shared by every request below.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}

# The first visit returns cookies that are not authorized yet.
ret1 = requests.get(
    url='https://dig.chouti.com/',
    headers=HEADERS,
)
ret1_cookies = ret1.cookies.get_dict()

# Logging in with those cookies authorizes them on the server side.
# Fill in your own account; credentials must not be hard-coded in shared code.
ret = requests.post(
    url='https://dig.chouti.com/login',
    data={
        'phone': '',
        'password': '',
        'oneMonth': ''
    },
    headers=HEADERS,
    cookies=ret1_cookies,
)

for num_page in range(2, 10):
    ret_index = requests.get(
        url='https://dig.chouti.com/all/hot/recent/%s' % (num_page),
        headers=HEADERS,
    )
    soup = BeautifulSoup(ret_index.text, 'html.parser')

    div = soup.find(name='div', id='content-list')
    if div is None:
        # Page layout differs or the request was rejected; nothing to vote on.
        continue

    item_list = div.find_all(attrs={'class': 'part2'})
    for item in item_list:
        num = item.get('share-linkid')

        # Vote using the cookies that the login request authorized.
        ret3 = requests.post(
            url='https://dig.chouti.com/link/vote?linksId=%s' % (num),
            # data={'linksId':'%s'%(num)},
            headers=HEADERS,
            cookies=ret1_cookies
        )
        print(ret3.text)
Github实例
import os
import re

import requests
from bs4 import BeautifulSoup


class Github(object):
    """Log in to GitHub with username/password and read the profile settings page.

    The login is performed while the object is constructed; the cookies
    collected along the way are kept in ``all_cookies``.
    """

    def __init__(self, username=None, passward=None):
        self.username = username
        self.passward = passward
        self.all_cookies = {}

        self.process()

    def process(self):
        """Validate the credentials and start the login flow.

        Raises:
            Exception: if the username or the password is missing.
        """
        if not (self.username and self.passward):
            raise Exception('请输入用户名和密码')
        self.get_login_key()

    def get_login_key(self):
        """Fetch the login page, extract authenticity_token and continue to login."""
        login_result = requests.get(
            url='https://github.com/login',
            headers={
                'Host': 'github.com',
            }
        )
        auth_key = BS4xpath.get_auth_key(login_result.text)
        self.all_cookies = login_result.cookies.get_dict()
        self.login(auth_key)

    def login(self, auth_key):
        """Post the login form and store the cookies of the logged-in session.

        Args:
            auth_key: the authenticity_token taken from the login page.

        Raises:
            Exception: if the ``logged_in`` cookie is missing or equals 'no'.
        """
        login_result = requests.post(
            url='https://github.com/session',
            headers={
                'Upgrade-Insecure-Requests': '',
                'Host': 'github.com',
            },
            data={
                'utf8': '✓',
                'authenticity_token': auth_key,
                'login': self.username,
                'password': self.passward,
                'commit': 'Sign in'
            },
            cookies=self.all_cookies
        )
        self.all_cookies.update(login_result.cookies.get_dict())
        # A missing cookie is treated like 'no' instead of raising KeyError.
        if self.all_cookies.get('logged_in', 'no') == 'no':
            raise Exception('用户名或密码错误')

    def get_msg(self):
        """Return the profile form fields as a dict of label text -> input value."""
        msg_obj = requests.get(
            url='https://github.com/settings/profile',
            headers={
                'Host': 'github.com',
                'Referer': 'https://github.com/',
            },
            cookies=self.all_cookies
        )
        msg = BS4xpath.get_msg_dict(msg_obj.text)

        return msg


class BS4xpath(object):
    """HTML extraction helpers used by Github."""

    @classmethod
    def get_auth_key(cls, text):
        """Return the value of the ``authenticity_token`` input found in *text*."""
        soup = BeautifulSoup(text, 'html.parser')
        auth_key = soup.find(name='input', attrs={'name': 'authenticity_token'}).get('value')
        return auth_key

    @classmethod
    def get_msg_dict(cls, text):
        """Map each form-group label to the value of its input element.

        Groups without a label or without an input are skipped; an empty dict
        is returned when the expected container is not present in *text*.
        """
        response = {}
        ret2_data = BeautifulSoup(text, 'html.parser')
        div = ret2_data.find(name='div', attrs={'class': "column two-thirds"})
        if div is None:
            return response
        dl_list = div.find_all(name='dl', attrs={'class': "form-group"})
        for row in dl_list:
            label = row.find('label')
            dd_input = row.find('input')
            if label and dd_input:
                response[label.text] = dd_input.get('value')
        return response


# Credentials come from the environment so they are never committed with the code.
obj = Github(username=os.environ.get('GITHUB_USERNAME'),
             passward=os.environ.get('GITHUB_PASSWORD'))

ret = obj.get_msg()
print(ret)
拉勾网实例
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re

import requests

# Browser-like User-Agent sent with every request.
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36')

# Cookies accumulated over the whole flow.
all_cookie = {}

# ############### 1. Open the login page ###############
r1 = requests.get(
    url='https://passport.lagou.com/login/login.html',
    headers={
        'Host': 'passport.lagou.com',
        'User-Agent': USER_AGENT,
    }
)
all_cookie.update(r1.cookies.get_dict())

# The anti-forgery token and code are assigned in inline script on the page.
X_Anti_Forge_Token = re.findall(r"window.X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
X_Anti_Forge_Code = re.findall(r"window.X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]

# ############### 2. Log in with username and password ###############
# Fill in your own username/password value; credentials must not be hard-coded.
r2 = requests.post(
    url='https://passport.lagou.com/login/login.json',
    headers={
        'Host': 'passport.lagou.com',
        'Referer': 'https://passport.lagou.com/login/login.html',
        'X-Anit-Forge-Code': X_Anti_Forge_Code,
        'X-Anit-Forge-Token': X_Anti_Forge_Token,
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': USER_AGENT,
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    },
    data={
        'isValidate': True,
        'username': '',
        'password': '',
        'request_form_verifyCode': '',
        'submit': '',
    },
    cookies=r1.cookies.get_dict()
)
all_cookie.update(r2.cookies.get_dict())

# ############### 3. Request the service ticket grant ###############
r3 = requests.get(
    url='https://passport.lagou.com/grantServiceTicket/grant.html',
    headers={'User-Agent': USER_AGENT},
    allow_redirects=False,
    cookies=all_cookie
)
all_cookie.update(r3.cookies.get_dict())

# ############### 4. Authenticate ###############
# Redirects are followed by hand (allow_redirects=False) so that the cookies
# set by every intermediate response can be collected. Four consecutive
# Location hops are followed.
hop = r3
for _ in range(4):
    hop = requests.get(
        url=hop.headers['Location'],
        headers={'User-Agent': USER_AGENT},
        allow_redirects=False,
        cookies=all_cookie
    )
    all_cookie.update(hop.cookies.get_dict())

# ############### 5. Open the personal resume page ###############
r5 = requests.get(
    url='https://www.lagou.com/resume/myresume.html',
    headers={'User-Agent': USER_AGENT},
    cookies=all_cookie
)
print('武沛齐' in r5.text)

# ############### 6. Read the account information ###############
r6 = requests.get(
    url='https://gate.lagou.com/v1/neirong/account/users/0/',
    headers={
        'User-Agent': USER_AGENT,
        'X-L-REQ-HEADER': "{deviceType:1}",
        'Origin': 'https://account.lagou.com',
        'Host': 'gate.lagou.com',
    },
    cookies=all_cookie
)
r6_json = r6.json()
all_cookie.update(r6.cookies.get_dict())

# ############### 7. Update the personal information ###############
# The submit code/token returned by step 6 must accompany the update.
r7 = requests.put(
    url='https://gate.lagou.com/v1/neirong/account/users/0/',
    headers={
        'User-Agent': USER_AGENT,
        'Origin': 'https://account.lagou.com',
        'Host': 'gate.lagou.com',
        'X-Anit-Forge-Code': r6_json['submitCode'],
        'X-Anit-Forge-Token': r6_json['submitToken'],
        'X-L-REQ-HEADER': "{deviceType:1}",
    },
    cookies=all_cookie,
    json={"userName": "wupeiqi888", "sex": "MALE", "portrait": "images/myresume/default_headpic.png",
          "positionName": '...', "introduce": '....'}
)
print(r7.text)
防止xss攻击
from bs4 import BeautifulSoup


class XSSFilter(object):
    """Whitelist-based HTML filter; every instantiation returns the same object.

    Tags that are not in ``valid_tags`` are hidden and, except for html/body,
    emptied. On whitelisted tags every attribute not listed for that tag is
    deleted.

    NOTE(review): attribute *values* are not inspected, so e.g.
    ``<a href="javascript:...">`` or a hostile ``style`` on ``<font>`` passes
    through unchanged. Validate URL schemes and style content before relying
    on this filter for untrusted input.
    """

    __instance = None

    def __init__(self):
        # XSS whitelist: tag name -> attribute names allowed on that tag
        self.valid_tags = {
            "font": ['color', 'size', 'face', 'style'],
            'b': [],
            'div': [],
            "span": [],
            "table": [
                'border', 'cellspacing', 'cellpadding'
            ],
            'th': [
                'colspan', 'rowspan'
            ],
            'td': [
                'colspan', 'rowspan'
            ],
            "a": ['href', 'target', 'name'],
            "img": ['src', 'alt', 'title'],
            'p': ['align'],
            "pre": ['class'],
            "hr": ['class'],
            'strong': []
        }

    def __new__(cls, *args, **kwargs):
        # Singleton: create the instance once and hand it out afterwards.
        # Only `cls` is forwarded: on Python 3 object.__new__ raises TypeError
        # when it receives extra arguments from an overriding __new__.
        if not cls.__instance:
            cls.__instance = object.__new__(cls)
        return cls.__instance

    def process(self, content):
        """Return *content* with non-whitelisted tags and attributes removed.

        Args:
            content: HTML markup as a string.

        Returns:
            The filtered markup as a string.
        """
        soup = BeautifulSoup(content, 'html.parser')

        # Walk over every tag in the document.
        for tag in soup.find_all():
            if tag.name not in self.valid_tags:
                # Not whitelisted: hide the tag itself ...
                tag.hidden = True
                # ... and drop its content too, unless it is html/body.
                if tag.name not in ['html', 'body']:
                    tag.clear()
                continue

            # Attribute whitelist of the current tag.
            attr_rules = self.valid_tags[tag.name]
            # Iterate over a copy of the keys because attributes are deleted
            # inside the loop.
            for key in list(tag.attrs.keys()):
                if key not in attr_rules:
                    del tag[key]

        # The decoded soup is the filtered content.
        return soup.decode()


content = """
<p class='c1' id='i1'>
asdfaa<span style="font-family:NSimSun;" class='c1'>sdf<a>a</a>sdf</span>sdf
</p>
<p>
<strong class='c2' id='i2'>asdf</strong>
<script>alert(123)</script>
</p>
<h2>
asdf
</h2>
"""

content = XSSFilter().process(content)
print('content', content)
总结:
- 如果爬取的网站有反爬措施,请求里模仿浏览器发给服务器端
- 如果需要携带信息过去:
- 去服务器返回的内容里找.如果有将他格式化成字典或其他保存在session
- 看到159900098这样格式的一般都是时间戳,但是位数需要自己观察
- 如果服务器返回的内容里没有key,那么去html或者js找相应的数据
- 可能下一次的操作需要携带着上一次服务器发过来的key或其他
- 状态码:
- 3开头的状态码是自动跳转.在自动跳转的时候可能进行cookies认证
- 注意响应头(Response Headers)里的 Set-Cookie 参数
参考:http://www.cnblogs.com/wupeiqi/articles/6283017.html
官方文档:http://cn.python-requests.org/zh_CN/latest/user/quickstart.html#id4
python爬虫之request and BeautifulSoup的更多相关文章
-
python爬虫(7)——BeautifulSoup
今天介绍一个非常好用的python爬虫库--beautifulsoup4.beautifulsoup4的中文文档参考网址是:http://beautifulsoup.readthedocs.io/zh ...
-
python爬虫数据解析之BeautifulSoup
BeautifulSoup是一个可以从HTML或者XML文件中提取数据的python库.它能够通过你喜欢的转换器实现惯用的文档导航,查找,修改文档的方式. BeautfulSoup是python爬虫三 ...
-
python爬虫入门四:BeautifulSoup库(转)
正则表达式可以从html代码中提取我们想要的数据信息,它比较繁琐复杂,编写的时候效率不高,但我们又最好是能够学会使用正则表达式. 我在网络上发现了一篇关于写得很好的教程,如果需要使用正则表达式的话,参 ...
-
python爬虫07 | 有了 BeautifulSoup ,妈妈再也不用担心我的正则表达式了
我们上次做了 你的第一个爬虫,爬取当当网 Top 500 本五星好评书籍 有些朋友觉得 利用正则表达式去提取信息 太特么麻烦了 有没有什么别的方式 更方便过滤我们想要的内容啊 emmmm 你还别说 还 ...
-
Python爬虫学习三------requests+BeautifulSoup爬取简单网页
第一次第一次用MarkDown来写博客,先试试效果吧! 昨天2018俄罗斯世界杯拉开了大幕,作为一个伪球迷,当然也得为世界杯做出一点贡献啦. 于是今天就编写了一个爬虫程序将腾讯新闻下世界杯专题的相关新 ...
-
Python 爬虫之request+beautifulsoup+mysql
一.什么是爬虫?它是指向网站发起请求,获取资源后分析并提取有用数据的程序:爬虫的步骤: 1.发起请求使用http库向目标站点发起请求,即发送一个RequestRequest包含:请求头.请求体等 2. ...
-
Python爬虫学习之使用beautifulsoup爬取招聘网站信息
菜鸟一只,也是在尝试并学习和摸索爬虫相关知识. 1.首先分析要爬取页面结构.可以看到一列搜索的结果,现在需要得到每一个链接,然后才能爬取对应页面. 关键代码思路如下: html = getHtml(& ...
-
python爬虫学习之使用BeautifulSoup库爬取开奖网站信息-模块化
实例需求:运用python语言爬取http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html这个开奖网站所有的信息,并且保存为txt文件和excel文件. 实 ...
-
python爬虫(5)——BeautifulSoup &; docker基础
BeautifulSoup基础实战 安装:pip install beautifulsoup4 常用指令: from bs4 import BeautifulSoup as bs import url ...
随机推荐
-
nodejs review-02
30 Receive POST data POST接受JSON数据处理; //req. res都是可读的stream; http.createServer(function (req, res) { ...
-
Linux的文件/目录访问权限
一直以为对这个概念非常懂,但这次还是犯了眼高手低的毛病. 配置服务器遇到了一个问题,对某个WEB目录(例如"/bin"),有两个用户要对其进行读写操作: 首先apache服务器要对 ...
-
使用ecshop电子商务系统的100个小问题
1:如何修改网站"欢迎光临本店" 回答:languages\zh_cn\common.php文件中, $_LANG['welcome'] = '欢迎光临本店';将他修改成你需要的字 ...
-
Javassist进行方法插桩
javassist官网 http://jboss-javassist.github.io/javassist/ javassist API网 http://jboss-javassist.github ...
-
Eralng的常用数据结构
1.记录(record) 适用于小数据,并且用属性名方便查找 2.Key/Value 类型 a.属性列表 就是类似[{Key, Value}]的列表,可以通过proplists模块来处理这样的列表 当 ...
-
python第一百零八天---Django 3 session 操作
上节内容回顾: 1.请求周期 url> 路由 > 函数或类 > 返回字符串或者模板语言? Form表单提交: 提交 -> url > 函数或类中的方法 - .... Ht ...
-
ORACLE报错和解决方案
ORA-01034: ORACLE not available ORA-27101 出现ORA-01034和ORA-27101的原因是多方面的:主要是oracle当前的服务不可用,shared mem ...
-
一口一口吃掉Volley(二)
欢迎访问我的个人博客转发请注明出处:http://www.wensibo.top/2017/02/17/一口一口吃掉Volley(二)/ 相信看了第一篇教程之后,你应该会对Volley有一个初步的了解 ...
-
where常用运算符
mysql查询的五种子句:where(条件查询).having(筛选).group by(分组).order by(排序).limit(限制结果数) where常用运算符:比较运算符> , &l ...
-
epoll—IO多路复用
1.在socket.listen()后创一个epoll对象 epoll = select.epoll() 2.将server_socket注册到epoll中 epoll.regist ...