Python web crawling with the requests module
- Handling cookies with a session
- Setting a request proxy IP with the proxies parameter
- Thread-pool-based data crawling
1. Obtaining the verification code
Steps:
1 Register at Yundama: http://www.yundama.com/about.html
2 Log in (there are separate developer and user logins)
3 After logging in as a developer, create a new software entry
4 Open the developer center
5 Go to the PythonHTTP download page
6 Download the version you need
Obtaining the verification code:
import http.client, mimetypes, urllib, json, time, requests
######################################################################
class YDMHttp:
    apiurl = 'http://api.yundama.com/api.php'
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def request(self, fields, files=[]):
        response = self.post_url(self.apiurl, fields, files)
        response = json.loads(response)
        return response

    def balance(self):
        data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey}
        response = self.request(data)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['balance']
        else:
            return -9001

    def login(self):
        data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey}
        response = self.request(data)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['uid']
        else:
            return -9001

    def upload(self, filename, codetype, timeout):
        data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)}
        file = {'file': filename}
        response = self.request(data, file)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['cid']
        else:
            return -9001

    def result(self, cid):
        data = {'method': 'result', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid)}
        response = self.request(data)
        return response and response['text'] or ''

    def decode(self, filename, codetype, timeout):
        cid = self.upload(filename, codetype, timeout)
        if (cid > 0):
            for i in range(0, timeout):
                result = self.result(cid)
                if (result != ''):
                    return cid, result
                else:
                    time.sleep(1)
            return -3003, ''
        else:
            return cid, ''

    def report(self, cid):
        data = {'method': 'report', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid), 'flag': '0'}
        response = self.request(data)
        if (response):
            return response['ret']
        else:
            return -9001

    def post_url(self, url, fields, files=[]):
        for key in files:
            files[key] = open(files[key], 'rb')
        res = requests.post(url, files=files, data=fields)
        return res.text
######################################################################
# Yundama username (ordinary user account)
username = 'molihua'
# Yundama password
password = 'MLH19960208'
# Software ID, a required parameter for developer revenue sharing. Get it from "My Software" in the developer console.
appid = 7025
# Software key, a required parameter for developer revenue sharing. Get it from "My Software" in the developer console.
appkey = '2d96c723a682c882faa73257e98440d7'
# Image file
filename = 'getimage.jpg'
# Captcha type, e.g. 1004 means 4 alphanumeric characters. Pricing differs by type; fill it in accurately or recognition accuracy suffers. All types are listed at http://www.yundama.com/price.html
codetype = 1004
# Timeout in seconds (user-defined)
timeout = 10
# Sanity check
if (username == 'username'):
    print('Please set the parameters above before testing')
else:
    # Initialize
    yundama = YDMHttp(username, password, appid, appkey)
    # Log in to Yundama
    uid = yundama.login()
    print('uid: %s' % uid)
    # Query the balance
    balance = yundama.balance()
    print('balance: %s' % balance)
    # Start recognition: image path, captcha type ID, timeout (seconds); returns the result
    cid, result = yundama.decode(filename, codetype, timeout)
    print('cid: %s, result: %s' % (cid, result))
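The Renren login example below calls a helper named getCodeDate that is not defined anywhere in this post. The following is a minimal sketch of what such a helper could look like, built on the YDMHttp class above; the appid, appkey, and timeout values are reused from the example above as placeholders, not confirmed values.
def getCodeDate(username, password, codePath, codeType):
    # Placeholder developer credentials: replace appid/appkey with your own
    appid = 7025
    appkey = '2d96c723a682c882faa73257e98440d7'
    timeout = 30
    yundama = YDMHttp(username, password, appid, appkey)
    # Validate the account, then upload the image and poll for the recognized text
    uid = yundama.login()
    cid, result = yundama.decode(codePath, codeType, timeout)
    return result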
Requirement: log in to Renren (renren.com) to fetch the personal home page
import requests
import urllib.request
from lxml import etree
# Create a session object
session = requests.Session()
url = 'http://www.renren.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:65.0) Gecko/20100101 Firefox/65.0'
}
renren_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(renren_text)
# Locate and download the captcha image shown on the login page
code_img_url = tree.xpath('//*[@id="verifyPic_login"]/@src')[0]
urllib.request.urlretrieve(url=code_img_url, filename='code.jpg')
# Recognize the text in the captcha image
code_data = getCodeDate('15204558261', 'MLH19960208', './code.jpg', 2004)
print(code_data)
login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2019142013687'
# Form data captured with a packet-capture tool
data = {
    'email': '15204558261',
    'icode': code_data,
    'origURL': 'http://www.renren.com/home',
    'domain': 'renren.com',
    'key_id': '1',
    'captcha_type': 'web_login',
    'password': '7bf638cc5b01b15b9416bf17fb98a1eda46da861c139b563a4c670fb21884336',
    'rkey': 'cf9180c5afba43cb1f089b953e67b567',
    'f': 'http%3A%2F%2Fwww.renren.com%2F296856777%2Fprofile'
}
# The cookies produced by this request are stored in the session object automatically
session.post(url=login_url, data=data, headers=headers)
# Later requests made through the session carry those cookies
url = 'http://www.renren.com/296856777/profile'
page_text = session.get(url=url, headers=headers).text
with open('renren.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
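As a quick sanity check (not part of the original script), you can print the cookies the session is now holding; if the login POST succeeded, the jar contains Renren's session cookies, and every later session.get() sends them automatically.
# Dump the cookies stored in the session's cookie jar after the login request
for name, value in session.cookies.items():
    print(name, '=', value)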
Thread-pool-based data crawling with multiprocessing.dummy
import requests
import random
from lxml import etree
import re
from fake_useragent import UserAgent
# Install the fake-useragent library first: pip install fake-useragent
# Import the thread pool module
from multiprocessing.dummy import Pool
# Instantiate the thread pool object
pool = Pool()
url = 'http://www.pearvideo.com/category_1'
# Generate a random User-Agent
ua = UserAgent().random
headers = {
    'User-Agent': ua
}
# Fetch the category page
page_text = requests.get(url=url, headers=headers).text
# Parse the links to the video detail pages out of the category page
tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@id="listvideoList"]/ul/li')
detail_urls = []  # stores the URLs of the second-level (detail) pages
for li in li_list:
    detail_url = 'http://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
    title = li.xpath('.//div[@class="vervideo-title"]/text()')[0]
    detail_urls.append(detail_url)
video_urls = []  # stores the URLs of the video files
for url in detail_urls:
    page_text = requests.get(url=url, headers=headers).text
    video_url = re.findall('srcUrl="(.*?)"', page_text, re.S)[0]
    video_urls.append(video_url)
def save(data):
    # Write one video's binary data to a randomly named .mp4 file
    fileName = str(random.randint(1, 10000)) + '.mp4'
    with open(fileName, 'wb') as fp:
        fp.write(data)
        print(fileName + ' saved')
# Download the video data with the thread pool
func_request = lambda link: requests.get(url=link, headers=headers).content
video_data_list = pool.map(func_request, video_urls)
# Save the video data with the thread pool
func_saveData = lambda data: save(data)
pool.map(func_saveData, video_data_list)
pool.close()
pool.join()
Proxies in the requests module
What is a proxy?
A proxy is a third party that handles something on behalf of the original party, like an agent, a reseller, or a purchasing service.
So why do crawlers need proxies?
Because some sites take anti-crawling measures. For example, a site may count how often a given IP visits within a time window; if the visits are too frequent, it decides this is probably not a normal user and blocks that IP. By crawling through proxy IPs we can keep collecting the data we need: even if one IP is banned for a while, we simply switch to another proxy IP and continue, as shown in the sketch and the full example after the list below.
Types of proxies:
Forward proxy: acts for the client to fetch data
Reverse proxy: acts for the server to serve data
Sites with free proxy IPs:
http://www.goubanjia.com/
Kuaidaili (快代理)
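In its simplest form, the proxies parameter is just a dict mapping a URL scheme to a proxy address. A minimal sketch; the IP and port below are placeholders, so substitute a live proxy taken from one of the sites above.
import requests

# Placeholder proxy; replace with a working address from a free-proxy site
proxies = {'http': 'http://112.115.57.20:3128'}
# Requests to http:// URLs are now routed through that proxy
response = requests.get('http://www.baidu.com/s?ie=UTF-8&wd=ip', proxies=proxies)
print(response.status_code)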
import requests
import random
if __name__ == "__main__":
    # User-Agent strings of different browsers
    header_list = [
        # Maxthon
        {"user-agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)"},
        # Firefox
        {"user-agent": "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"},
        # Chrome
        {"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"}
    ]
    # Different proxy IPs
    proxy_list = [
        {"http": "112.115.57.20:3128"},
        {'http': '121.41.171.223:3128'}
    ]
    # Pick a random UA and proxy IP
    header = random.choice(header_list)
    proxy = random.choice(proxy_list)
    url = 'http://www.baidu.com/s?ie=UTF-8&wd=ip'
    # The proxies argument sets the proxy
    response = requests.get(url=url, headers=header, proxies=proxy)
    response.encoding = 'utf-8'
    with open('daili.html', 'wb') as fp:
        fp.write(response.content)
    # Switch back to the original IP (no proxy)
    requests.get(url, proxies={"http": ""})