思路分析:
1.检查数据是否在页面源码或框架源码中
2.数据不在页面源码中说明网站通过二次加载的形式绑定数据到页面
3.抓取二次加载链接,确认它的请求方式(这里是 POST),找到请求需要传递的参数(params 即加密后的 encText,以及 encSecKey)
4.参考网站的加密逻辑对参数进行加密(借助浏览器 Call Stack,观察参数在加密前后的变化,定位执行加密的函数)
5.使用加密后参数向网站进行请求,抓取内容并下载
实战案例:
# Demo description: comprehensive exercise - scraping NetEase Cloud Music comments
import json
import requests
from Crypto.Cipher import AES
from base64 import b64encode
from fake_useragent import UserAgent
'''
本次观察到的加密方法:
window.asrsea(JSON.stringify(i2x), buV3x(["流泪", "强"]), buV3x(Rg7Z.md), buV3x(["爱心", "女孩", "惊恐", "大笑"]))
function a(a) {
var d, e, b = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", c = "";
for (d = 0; a > d; d += 1)
e = Math.random() * b.length,
e = Math.floor(e),
c += b.charAt(e);
return c
}
function b(a, b) {
var c = CryptoJS.enc.Utf8.parse(b)
, d = CryptoJS.enc.Utf8.parse("0102030405060708")
, e = CryptoJS.enc.Utf8.parse(a)
, f = CryptoJS.AES.encrypt(e, c, {
iv: d,
mode: CryptoJS.mode.CBC
});
return f.toString()
}
function c(a, b, c) {
var d, e;
return setMaxDigits(131),
d = new RSAKeyPair(b,"",c),
e = encryptedString(d, a)
}
function d(d, e, f, g) {
var h = {}
, i = a(16); # 随机数
return h.encText = b(d, g),
h.encText = b(h.encText, i), # encText
h.encSecKey = c(i, e, f), # encSecKey
h
}
function e(a, b, d, e) {
var f = {};
return f.encText = c(a + e, b, d),
f
}
'''
# This run fetches the hot comments of the song '騎士王の誇り'.
url = 'https://music.163.com/weapi/comment/resource/comments/get?csrf_token='

# Plain-text request payload; it is JSON-serialised and encrypted before being
# posted as the `params` form field.
data = {
    "csrf_token": "",
    "cursor": "-1",
    "offset": "0",
    "orderType": "1",
    "pageNo": "1",
    "pageSize": "20",
    "rid": "R_SO_4_448119",
    "threadId": "R_SO_4_448119"
}

# Counterparts of the arguments e, f, g of the site's JavaScript
# `function d(d, e, f, g)` quoted above.
# `e` and `f` are the values handed to `RSAKeyPair` in the JavaScript
# `function c`; they are not referenced by the Python code visible here.
# NOTE: the original assignments ended with a trailing comma, which silently
# turned `e` and `f` into 1-tuples instead of strings.
e = '010001'
f = (
    '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e4'
    '17629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cc'
    'e10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
)
# AES key for the first encryption pass (`b(d, g)` in the JavaScript).
g = '0CoJUm6Qyw8W8jud'
# AES key for the second encryption pass. The site generates it randomly with
# `a(16)`; here it is pinned to a fixed 16-character value.
i = 'TA0sm9lgTIxNfOhl'
def GetEncSecKey():
    """Return the hard-coded hexadecimal string posted as `encSecKey`.

    NOTE(review): presumably this is the value the site's JavaScript
    `c(i, e, f)` produced for the fixed key `i` used in this script; if `i`
    is changed this constant most likely has to be regenerated - confirm.
    """
    hex_chunks = (
        'c66a9f40ff5ecca6999d183c013246951c9d07205228051b48ad3398d99e8d16aa516a6544a9022fdf2547df',
        'a04450b908b7da4cd756881eab04ed01aa9d79bb051ef58755f40f7d2a8dd44fb461c73fc3978f6b16417d077c8',
        '57cb87a1fa8c152ebb208db4d9dadd8a967fad492561da879f37a2fadc2b00115598f1568d6f7',
    )
    return ''.join(hex_chunks)
def GetEncText(data):
    """Encrypt `data` twice and return the result.

    The first pass uses the module-level key `g`; its output is then
    encrypted again with the module-level key `i`.  This follows the two
    `b(...)` calls in the site's JavaScript `function d`.

    :param data: text to encrypt (the caller passes a JSON string).
    :return: whatever `EncParams` returns for the second pass.
    """
    return EncParams(EncParams(data, g), i)
def To_16(data):
    """Pad `data` so that its UTF-8 encoding is a multiple of 16 bytes long.

    PKCS#7-style padding: `pad` characters, each equal to `chr(pad)`, are
    appended, where `pad` is in the range 1..16.  A full block of 16 padding
    characters is added when the input is already aligned.

    The pad length is derived from the *encoded byte length* rather than the
    character count, because the caller encrypts `To_16(data).encode('utf-8')`
    and non-ASCII characters occupy more than one byte.  For pure-ASCII input
    the result is the same as padding on the character count.

    :param data: text to pad (str).
    :return: the padded text (str).
    """
    pad = 16 - len(data.encode('utf-8')) % 16
    # chr(1)..chr(16) each encode to a single byte, so the padding adds exactly
    # `pad` bytes to the encoded length.
    return data + chr(pad) * pad
def EncParams(data, key):  # Mirrors the site's JavaScript `function b(a, b)` quoted above
    """AES-CBC encrypt `data` with `key` and return the ciphertext as base64 text.

    :param data: text to encrypt (str); it is padded with `To_16` first.
    :param key: AES key as text (str); it is UTF-8 encoded before use.
    :return: base64 representation of the ciphertext, as a str.
    """
    cipher = AES.new(
        key.encode('utf-8'),
        AES.MODE_CBC,
        IV='0102030405060708'.encode('utf-8'),  # same fixed IV as the JavaScript
    )
    # The plaintext is padded by To_16 and then UTF-8 encoded before encryption.
    plaintext = To_16(data).encode('utf-8')
    ciphertext = cipher.encrypt(plaintext)
    # The raw ciphertext is arbitrary bytes and cannot be decoded as UTF-8
    # directly, so it is base64-encoded first and that result is turned into text.
    return b64encode(ciphertext).decode('utf-8')
# Send the request with a randomly chosen browser User-Agent string.
ua = UserAgent()
user_agent = ua.random
headers = {'user-agent': user_agent}

# The endpoint expects two form fields: `params` (the doubly encrypted JSON
# payload) and `encSecKey`.
form = {
    'params': GetEncText(json.dumps(data)),
    'encSecKey': GetEncSecKey(),
}
# Without a timeout, requests waits indefinitely if the server stalls.
resp = requests.post(url, data=form, headers=headers, timeout=10)
print(resp.json())