Python
预先配置好Python环境, 并安装requests, beautifulsoup4, lxml这三个包.
Step 1 模拟登录
打开开发者工具, 手动登录, 然后在Network中找到登录请求的网址(Request Method为POST)及其提交的登录表单. 登录后页面跳转会清空请求记录, 为防止来不及查看, 可勾选Preserve log.
使用下述Python代码模拟登录.
import requests
# A Session keeps the login cookies for every later request made through it.
s = requests.Session()
# Field names must match the login form captured in the browser's Network tab.
form = {
    'username': '',
    'password': '',
}
# loginUrl is the POST endpoint observed in the Network tab (fill it in before running).
r = s.post(loginUrl, data=form)
# Stop early on an HTTP error instead of scraping while not logged in.
r.raise_for_status()
Step 2 查找结点
from bs4 import BeautifulSoup
r = s.get(url)  # page to scrape
r.encoding = 'gbk'  # character encoding of the page
soup = BeautifulSoup(r.text, 'lxml')
# find example
title = soup.find('h1').text
# find_all example; the target nodes have the structure
# <dl class='attachlist'><dt><a href=fileUrl>filename</a></dt></dl>
dls = soup.find_all('dl', class_='attachlist')
for dl in dls:
    filename = dl.dt.a.text
    fileUrl = baseUrl + dl.dt.a.get('href')
Step 3 下载
# Modified from https://www.jianshu.com/p/e3444c52c043
def download(url, s, filename):
    """Stream ``url`` through session ``s`` into ``./filename``.

    Progress is printed in place while the body is received.

    Args:
        url: Address of the file to fetch.
        s: Session-like object whose ``get(url, stream=..., timeout=...)``
            returns a response usable as a context manager.
        filename: Name of the target file inside the current directory.

    Returns:
        True when the file was downloaded; False when it already existed or
        the download failed (read timeout, connection error, HTTP error).
    """
    import os

    path = './' + filename
    print('Downloading {}'.format(filename))
    # Check before sending the request so an existing file costs no round trip.
    if os.path.isfile(path):
        print(' File already exist, skipped')
        return False

    # Write to a side file first: an interrupted transfer must not leave a
    # truncated file under the final name, or the next run would skip it.
    part_path = path + '.part'
    chunk_size = 1000
    try:
        with s.get(url, stream=True, timeout=2) as r:
            # Do not save an error page as if it were the requested file.
            r.raise_for_status()
            # Content-Length is absent on chunked replies; 0 disables the percentage.
            length = int(r.headers.get('Content-Length', 0))
            received = 0
            with open(part_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size):
                    f.write(chunk)
                    # Count real bytes: the last chunk is usually shorter than chunk_size.
                    received += len(chunk)
                    if length > 0:
                        print('\r {:.2f}%'.format(received / length * 100), end='')
        os.replace(part_path, path)
        print('\r Finished ')
        return True
    except requests.exceptions.ReadTimeout:
        print('Read time out, this file failed to download')
    except requests.exceptions.ConnectionError:
        print('ConnectionError, this file failed to download')
    except requests.exceptions.HTTPError as err:
        print('HTTP error ({}), this file failed to download'.format(err))

    # Only failures reach this point: drop whatever was written so far.
    if os.path.exists(part_path):
        os.remove(part_path)
    return False
Javascript
-
用Javascript比Python更灵活方便. 不需要配置编程环境, 也不需要模拟登录, 在浏览器上手动登录后就地运行脚本即可, 并且可以在开发者工具 - Console里实时测试脚本.
-
可以将脚本保存在油猴插件中以便重复使用. 注意如果待爬网站不自带jQuery, 需要在油猴插件的脚本中手动导入, 例如
// @require https://apps.bdimg.com/libs/jquery/2.1.4/jquery.min.js
这里贴出一个知轩藏书的爬虫
// ==UserScript==
// @name 知轩藏书爬虫
// @namespace http://tampermonkey.net/
// @version 0.1
// @description try to take over the world!
// @author You
// @include *zxcs.me*
// @grant none
// @require https://apps.bdimg.com/libs/jquery/2.1.4/jquery.min.js
// ==/UserScript==
// Books listed per index page; used to turn (page offset, row) into a slot of output[].
const booksPerPage = 15;
// Collected book records, indexed by crawl position.
const output = [];
// Progress counters: books processed so far and the total used as denominator.
let booksFinished = 0;
let booksTotal = 0;
// Last percentage written to the badge, so the DOM is only touched on change.
let percentage = 0;
// jQuery handle of the on-page progress badge; assigned when crawling starts.
let progress;
/**
 * Refresh the progress badge with the whole-number percentage of finished books.
 * The badge is only rewritten when the displayed number would change.
 */
function updateProgress() {
    const current = Math.floor(booksFinished / booksTotal * 100);
    if (current === percentage) {
        return;
    }
    percentage = current;
    progress.text(current);
}
/**
 * Return the first match of a regular expression inside a string.
 * @param {string} str string to search
 * @param {RegExp} re pattern to look for
 * @param {any} defau value returned when nothing matches
 * @returns {any} the matched text, or `defau` when there is no match
 */
function searchString(str, re, defau) {
    const found = str.match(re);
    if (found !== null) {
        return found[0];
    }
    return defau;
}
/**
 * Save output[] as `${section}.json` by triggering a browser download.
 * @param {string} section section name, used as the file name
 */
function download(section) {
    // Slots that were never filled would serialise as null; export only real
    // records so consumers of the file do not have to special-case them.
    const records = output.filter((book) => book != null);
    const blob = new Blob([JSON.stringify(records)], { type: 'application/json' });
    const url = URL.createObjectURL(blob);
    const a = document.createElement('a');
    a.href = url;
    a.download = section + '.json';
    a.click();
    // Release the blob after the browser has had time to start the download.
    setTimeout(() => URL.revokeObjectURL(url), 1000);
}
/**
 * Fetch the basic information of a book (title, categories, size, synopsis).
 * @param {object} book book record; `book.id` is read, the info fields are written
 * @returns {Promise<void>}
 * @throws {Error} when the request fails or the page lacks the expected paragraphs
 */
async function getInfo(book) {
    const response = await fetch(`http://www.zxcs.me/post/${book.id}`);
    // fetch() resolves on HTTP errors too, so check the status explicitly.
    if (!response.ok) throw new Error(`getInfo: HTTP ${response.status}`);
    const page = $(await response.text());
    const ps = page.find('div#content > p');
    if (ps.length < 3) throw new Error(`getInfo: ps.length < 3`);
    // First paragraph: whitespace-separated words whose last two are the categories.
    const arr = ps.eq(0).text().trim().split(/\s+/);
    if (arr.length < 2) throw new Error(`getInfo: arr.length < 2`);
    // Third paragraph, with all whitespace removed, holds the size and the synopsis.
    const detail = ps.eq(2).text().replace(/\s/g, '');
    book.title = page.find('div#content h1').text();
    // Spelling "catogory" kept on purpose: these are the keys written to the exported JSON.
    book.catogory1 = arr[arr.length - 2];
    book.catogory2 = arr[arr.length - 1];
    book.size = searchString(detail, /[\d\.]+[MK]B/, '?');
    book.intro = detail.replace(/^.*【内容简介】:/, '');
}
/**
 * Fetch the reader votes of a book (the five tiers xian / liang / gan / ku / du).
 * @param {object} book book record; `book.id` is read, the five vote counts are written
 * @returns {Promise<void>}
 * @throws {Error} when the request fails or the reply does not hold exactly five fields
 */
async function getVotes(book) {
    // The random `m` parameter makes every URL unique (presumably to bypass caching).
    const response = await fetch(`http://www.zxcs.me/content/plugins/cgz_xinqing/cgz_xinqing_action.php?action=show&id=${book.id}&m=${Math.random()}`);
    // fetch() resolves on HTTP errors too, so check the status explicitly.
    if (!response.ok) throw new Error(`getVotes: HTTP ${response.status}`);
    const reply = await response.text();
    const counts = reply.split(',').map((field) => parseInt(field, 10));
    if (counts.length !== 5) throw new Error(`getVotes: arr.length !== 5`);
    [book.xian, book.liang, book.gan, book.ku, book.du] = counts;
}
/**
 * Fetch the download link of a book.
 * @param {object} book book record; `book.id` is read, `book.url` is written
 * @returns {Promise<void>}
 * @throws {Error} when the request fails or the page holds no download link
 */
async function getUrl(book) {
    const response = await fetch(`http://www.zxcs.me/download.php?id=${book.id}`);
    // fetch() resolves on HTTP errors too, so check the status explicitly.
    if (!response.ok) throw new Error(`getUrl: HTTP ${response.status}`);
    const html = await response.text();
    // Only the first link inside .downfile is used.
    const h = $(html).find('.downfile a').first().prop('href');
    if (h === undefined) throw new Error(`getUrl: h === undefined`);
    book.url = h;
}
/**
 * Collect everything known about one book: info, votes, then download link.
 * The steps run one after another; the first failure is logged and skips the
 * remaining steps. The book always counts as finished for the progress badge.
 * @param {object} book book record, filled in place
 * @returns {Promise<void>} never rejects because of a failed step
 */
async function parseBook(book) {
    const steps = [getInfo, getVotes, getUrl];
    try {
        for (const step of steps) {
            await step(book);
        }
    } catch (err) {
        console.log(book, err.message);
    } finally {
        booksFinished++;
        updateProgress();
    }
}
/**
 * Collect the information of every book listed on one index page.
 * @param {string} url address of the index page
 * @param {number} offset distance, in pages, from the first crawled page
 * @returns {Promise<void>} resolves once every book of the page is processed;
 *     failures are logged, never thrown
 */
async function parsePage(url, offset) {
    try {
        const response = await fetch(url);
        // fetch() resolves on HTTP errors too, so check the status explicitly.
        if (!response.ok) throw new Error(`parsePage: HTTP ${response.status}`);
        const html = await response.text();
        const tasks = [];
        $(html).find('dl#plist dt a').each((i, a) => {
            // The book id is the last path segment of the link.
            const id = parseInt($(a).attr('href').replace(/^.*\//, ''), 10);
            const book = { id };
            // The slot is fixed up front, so output keeps listing order even
            // though the books of a page finish in arbitrary order.
            output[offset * booksPerPage + i] = book;
            tasks.push(parseBook(book));
        });
        await Promise.all(tasks);
    } catch (err) {
        console.log(url, err.message);
    }
}
/**
 * Parse the "start end" page range typed by the user.
 * @param {string} input raw text from the prompt
 * @param {number} pagesTotal highest valid page number
 * @returns {number[]|null} `[start, end]`, or null when the input is invalid
 */
function parsePageRange(input, pagesTotal) {
    const parts = input.trim().split(/\s+/);
    if (parts.length !== 2) return null;
    const [start, end] = parts.map((str) => parseInt(str, 10));
    // NaN has to be rejected explicitly: every comparison with NaN is false,
    // so range checks alone would let non-numeric input through.
    if (!Number.isInteger(start) || !Number.isInteger(end)) return null;
    if (start < 1 || end > pagesTotal || start > end) return null;
    return [start, end];
}

// Entry point: once the DOM is ready, offer to crawl the section shown on this page.
$(function () {
    console.log('知轩藏书爬虫正在运行');
    // The last pagination link ends with the number of the last page.
    let baseUrl = $('div#pagenavi a').last().attr('href');
    if (!baseUrl) return;
    const lastPage = searchString(baseUrl, /\d+$/, null);
    if (lastPage === null) return;
    const pagesTotal = parseInt(lastPage, 10);
    baseUrl = baseUrl.replace(/\d+$/, '');
    const section = $('div#ptop').text().trim().split(/\s+/).pop();
    if (!confirm(`找到${pagesTotal}个页面,是否爬取【${section}】板块下的书籍信息?`)) return;
    const input = prompt(`输入待爬取页码,格式为"起始页码 终止页码"`, `1 ${pagesTotal}`);
    if (input == null) return;
    const range = parsePageRange(input, pagesTotal);
    if (range === null) {
        alert('页码格式不合法!');
        return;
    }
    const [firstPage, finalPage] = range;
    // Progress is measured against the pages actually requested, not the whole section.
    booksTotal = (finalPage - firstPage + 1) * booksPerPage;
    $('body').append(`<div id='temp-prog' style='position: fixed; left: 2%; top: 2%; width: 48px; height: 48px; border-radius: 24px; text-align: center; color: white; background: red; font-size: 25px; line-height: 48px; font-family: monospace; box-shadow: 2px 2px 5px 2px #d6d6d6;'>0</div>`);
    progress = $('#temp-prog');
    (async () => {
        try {
            // Pages are crawled one at a time; books within a page run concurrently.
            for (let i = firstPage; i <= finalPage; ++i) {
                await parsePage(baseUrl + i, i - firstPage);
            }
            download(section);
        } catch (err) {
            console.error(err);
        } finally {
            // Remove the badge even when the crawl or the export failed.
            progress.remove();
        }
    })();
});
爬取结果会自动保存为一个json文件. 之后可以随便找一个在线json转Excel的网站将之转化为Excel表格(例如"JSON 转换 Excel - 在线小工具"). 效果如下
另:获得书籍仙粮率95%的置信上下界,以下界为关键字可获得较好的排序。
// Critical value of the standard normal distribution for a two-sided 95% interval.
const z = 1.96;
// Books merged from every input file, each annotated with its confidence bounds.
const output = [];
/**
 * Attach the 95% confidence bounds of a book's favourable-vote rate
 * (xian + liang over all votes) as `book.lower` and `book.upper`.
 * Both bounds are set to 0 when the counts are not numeric or when fewer than
 * five votes fall on either side of the favourable/unfavourable split.
 * @param {object} book book record with the five vote counts; modified in place
 */
function getEstimate(book) {
    const favourable = book.xian + book.liang;
    const total = favourable + book.gan + book.ku + book.du;
    const usable = !isNaN(favourable) && !isNaN(total)
        && favourable >= 5 && favourable <= total - 5;
    if (!usable) {
        book.lower = book.upper = 0;
        return;
    }
    // The bounds are the two roots of a*x^2 - b*x + c = 0 (Wilson score interval).
    const p = favourable / total;
    const spread = z ** 2 / total;
    const a = 1 + spread;
    const b = 2 * p + spread;
    const c = p ** 2;
    const root = Math.sqrt(b ** 2 - 4 * a * c);
    const denominator = 2 * a;
    book.lower = (b - root) / denominator;
    book.upper = (b + root) / denominator;
}
/**
 * Load one exported book list, annotate every book with its confidence
 * bounds and append it to output[].
 * @param {string} filename address of the JSON file to fetch
 * @returns {Promise<void>}
 * @throws {Error} when the file cannot be fetched or is not valid JSON
 */
async function parseJson(filename) {
    const response = await fetch(filename);
    // fetch() resolves on HTTP errors too; fail with a readable message
    // instead of a JSON syntax error caused by an error page.
    if (!response.ok) throw new Error(`parseJson: ${filename}: HTTP ${response.status}`);
    const books = await response.json();
    for (const book of books) {
        // Unfilled array slots are exported as null; skip them.
        if (book == null) continue;
        getEstimate(book);
        output.push(book);
    }
}
// JSON files to merge, fetched relative to the current page
// (presumably one export per site section — confirm against the crawler output).
const files = [
    '二次元.json',
    '奇幻·玄幻.json',
    '科幻·灵异.json',
    '都市·娱乐.json',
    '历史·军事.json',
    '武侠·仙侠.json',
    '竞技·游戏.json',
];
/**
 * Save a value as `${name}.json` by triggering a browser download.
 * @param {any} obj value to serialise with JSON.stringify
 * @param {string} name file name without extension
 */
function download(obj, name) {
    const blob = new Blob([JSON.stringify(obj)], { type: 'application/json' });
    const url = URL.createObjectURL(blob);
    const a = document.createElement('a');
    a.href = url;
    a.download = `${name}.json`;
    a.click();
    // Release the blob after the browser has had time to start the download.
    setTimeout(() => URL.revokeObjectURL(url), 1000);
}
// Merge every input file in order, then export the combined list as all.json.
(async () => {
    // Files are processed one at a time so output keeps the order of `files`.
    for (const file of files) {
        await parseJson(file);
    }
    download(output, 'all');
})().catch((err) => {
    // Report the failure instead of leaving an unhandled rejection.
    console.error(err);
});
另附简单的网页版