A Simple Web Crawler (Python, JavaScript)

Date: 2024-03-04 17:20:47

Python

Set up a Python environment in advance with the requests, beautifulsoup4, and lxml packages installed (pip install requests beautifulsoup4 lxml).

Step 1: Simulating the login

Open the browser's developer tools, log in manually, then find the login URL (Request Method: POST) and the submitted form fields in the Network panel. Check Preserve log so the request is not cleared when the page redirects after login.

Simulate the login with the following Python code.

import requests

s = requests.Session()

# loginUrl and the form fields are the URL and payload found in the Network panel
loginUrl = ''
form = {
    'username': '',
    'password': '',
}
s.post(loginUrl, data=form)
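
To confirm the login actually succeeded before crawling, a quick check is to request a page that is only visible after logging in and look for a known marker on it. A minimal sketch; profileUrl and the marker string are hypothetical placeholders:

# Hypothetical check: profileUrl stands for any members-only page on the target site
r = s.get(profileUrl)
if r.ok and 'username' in r.text:   # replace 'username' with text shown only when logged in
    print('login ok')
else:
    print('login failed')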

Step 2: Finding nodes

from bs4 import BeautifulSoup

r = s.get(url)       # page to crawl
r.encoding = 'gbk'   # page encoding; adjust to match the target site

soup = BeautifulSoup(r.text, 'lxml')

# find example
title = soup.find('h1').text

# find_all example; the target nodes look like
# <dl class='attachlist'><dt><a href=fileUrl>filename</a></dt></dl>
dls = soup.find_all('dl', class_='attachlist')
for dl in dls:
    filename = dl.dt.a.text
    fileUrl = baseUrl + dl.dt.a.get('href')  # baseUrl: site root, since href is relative

Step 3: Downloading

# Modified from https://www.jianshu.com/p/e3444c52c043
def download(url, s, filename):
    import os, urllib.parse
    # filename = urllib.parse.unquote(url)
    # filename = filename[filename.rfind('/') + 1:]
    try:
        r = s.get(url, stream=True, timeout=2)
        chunk_size = 1000
        timer = 0
        length = int(r.headers['Content-Length'])
        print('Downloading {}'.format(filename))
        if os.path.isfile('./' + filename):
            print('  File already exists, skipped')
            return False
        with open('./' + filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size):
                timer += chunk_size
                percent = round(timer / length, 4) * 100
                print('\r  {:.2f}%'.format(percent), end='')
                f.write(chunk)
        print('\r  Finished      ')
        return True
    except requests.exceptions.ReadTimeout:
        print('Read timed out, this file failed to download')
        return False
    except requests.exceptions.ConnectionError:
        print('ConnectionError, this file failed to download')
        return False
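
Putting Steps 2 and 3 together, the attachments found by the find_all loop can be fetched with the same logged-in session. A minimal usage sketch, reusing dls, baseUrl and the session s from above:

# Download every attachment found in Step 2
for dl in dls:
    filename = dl.dt.a.text
    fileUrl = baseUrl + dl.dt.a.get('href')
    download(fileUrl, s, filename)   # skips files that already exist locally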

JavaScript

  • JavaScript is more flexible and convenient than Python here: there is no environment to set up and no login to simulate. Just log in manually in the browser and run the script in place, and you can test it interactively in the developer tools Console.

  • The script can be saved as a Tampermonkey userscript for reuse. Note that if the target site does not ship jQuery itself, you have to import it manually in the userscript header, for example

// @require      https://apps.bdimg.com/libs/jquery/2.1.4/jquery.min.js

Below is a crawler for 知轩藏书 (zxcs.me).

// ==UserScript==
// @name         知轩藏书爬虫
// @namespace    http://tampermonkey.net/
// @version      0.1
// @description  try to take over the world!
// @author       You
// @include      *zxcs.me*
// @grant        none
// @require      https://apps.bdimg.com/libs/jquery/2.1.4/jquery.min.js
// ==/UserScript==

const booksPerPage = 15;
const output = [];

let booksFinished = 0, booksTotal = 0;
let percentage = 0;
let progress;

function updateProgress() {
    var p = Math.floor(booksFinished / booksTotal * 100);
    if (p != percentage) {
        percentage = p;
        progress.text(p);
    }
}

/**
 * Search a string with a regular expression
 * @param {string} str the string to search
 * @param {RegExp} re the regular expression
 * @param {any} defau value returned when there is no match
 */
function searchString(str, re, defau) {
    var matches = str.match(re);
    return matches === null ? defau : matches[0];
}

/**
 * Save output[] as `${section}.json`
 * @param {string} section section name
 */
function download(section) {
    var a = document.createElement('a');
    var e = document.createEvent('MouseEvents');
    e.initEvent('click', false, false);
    a.download = section + '.json';
    var blob = new Blob([JSON.stringify(output)]);
    a.href = URL.createObjectURL(blob);
    a.dispatchEvent(e);
}

/**
 * Fetch a book's basic info (title, category, size, intro)
 * @param {object} book book object
 * @returns promise
 */
async function getInfo(book) {
    let response = await fetch(`http://www.zxcs.me/post/${book.id}`);
    let result = await response.text();
    result = $(result);
    var ps = result.find('div#content > p');
    if (ps.length < 3) throw new Error(`getInfo: ps.length < 3`);
    var arr = ps.eq(0).text().trim().split(/\s+/);
    if (arr.length < 2) throw new Error(`getInfo: arr.length < 2`);
    var detail = ps.eq(2).text().replace(/\s/g, '');

    book.title     = result.find('div#content h1').text();
    book.catogory1 = arr[arr.length - 2];
    book.catogory2 = arr[arr.length - 1];
    book.size      = searchString(detail, /[\d\.]+[MK]B/, '?');
    book.intro     = detail.replace(/^.*【内容简介】:/, '');
}

/**
 * Fetch a book's ratings (the five vote categories: xian, liang, gan, ku, du)
 * @param {object} book book object
 * @returns promise
 */
async function getVotes(book) {
    let response = await fetch(`http://www.zxcs.me/content/plugins/cgz_xinqing/cgz_xinqing_action.php?action=show&id=${book.id}&m=${Math.random()}`);
    let result = await response.text();
    var arr = result.split(',').map(x => parseInt(x));
    if (arr.length !== 5) throw new Error(`getVotes: arr.length !== 5`);

    book.xian  = arr[0];
    book.liang = arr[1];
    book.gan   = arr[2];
    book.ku    = arr[3];
    book.du    = arr[4];
}

/**
 * Fetch a book's download link
 * @param {object} book book object
 * @returns promise
 */
async function getUrl(book) {
    let response = await fetch(`http://www.zxcs.me/download.php?id=${book.id}`);
    let result = await response.text();
    var h = $(result).find('.downfile a').first().prop('href');
    if (h === undefined) throw new Error(`getUrl: h === undefined`);

    book.url = h;
}

/**
 * Fetch all the information for a book
 * @param {object} book book object
 * @returns promise
 */
async function parseBook(book) {
    try {
        await getInfo(book);
        await getVotes(book);
        await getUrl(book);
    } catch (err) {
        console.log(book, err.message);
    } finally {
        booksFinished++;
        updateProgress();
    }
}

/**
 * Fetch the info of every book on a page
 * @param {string} url page URL
 * @param {number} offset offset from the starting page number
 * @returns promise
 */
async function parsePage(url, offset) {
    try {
        let response = await fetch(url);
        let result = await response.text();
        var arr = [];
        $(result).find('dl#plist dt a').each((i, a) => {
            var book = output[offset * booksPerPage + i] = {
                id: parseInt($(a).attr('href').replace(/^.*\//, ''))
            };
            arr.push(parseBook(book));
        });
        await Promise.all(arr);
    } catch (err) {
        console.log(url, err.message);
    }
}

$(function () {
    console.log('知轩藏书爬虫正在运行');

    let baseUrl = $('div#pagenavi a').last().attr('href');
    if (baseUrl) {
        let pagesTotal = searchString(baseUrl, /\d+$/, null);
        if (pagesTotal === null) return;
        else pagesTotal = parseInt(pagesTotal);

        booksTotal = pagesTotal * booksPerPage;

        baseUrl = baseUrl.replace(/\d+$/, '');
        let section = $('div#ptop').text().trim().split(/\s+/).pop();

        if (confirm(`找到${pagesTotal}个页面,是否爬取【${section}】板块下的书籍信息?`)) {
            var input = prompt(`输入待爬取页码,格式为"起始页码 终止页码"`, `1 ${pagesTotal}`);
            if (input == null) return;

            var inputInts = input.trim().split(/\s+/).map(str => parseInt(str));
            if (inputInts.length !== 2
                || inputInts[0] < 1 || inputInts[1] > pagesTotal
                || inputInts[0] > inputInts[1]) {
                alert('页码格式不合法!');
                return;
            }

            $('body').append(`<div id='temp-prog' style='position: fixed; left: 2%; top: 2%; width: 48px; height: 48px; border-radius: 24px; text-align: center; color: white; background: red; font-size: 25px; line-height: 48px; font-family: monospace; box-shadow: 2px 2px 5px 2px #d6d6d6;'>0</div>`);
            progress = $('#temp-prog');

            (async () => {
                for (var i = inputInts[0]; i <= inputInts[1]; ++i) {
                    await parsePage(baseUrl + i, i - inputInts[0]);
                }
                download(section);
                progress.remove();
            })();
        }
    }
});

The crawl results are saved automatically as a JSON file. You can then feed it to any online JSON-to-Excel converter to get a spreadsheet (e.g. JSON 转换 Excel - 在线小工具). The end result looks like this:

Click here to get the crawled data directly - OneDrive
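
If you would rather do the conversion locally instead of using a website, a minimal sketch with pandas works as well (this assumes pandas and openpyxl are installed; the filename is just an example):

import pandas as pd

# Read the JSON array produced by the crawler and write it out as an Excel sheet
df = pd.read_json('奇幻·玄幻.json')
df.to_excel('奇幻·玄幻.xlsx', index=False)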

Additionally: compute 95% confidence bounds on each book's xian+liang ratio; sorting by the lower bound gives a more useful ranking.
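
For reference, the bounds computed by getEstimate below correspond to the Wilson score interval for the proportion p̂ = (xian + liang)/n with z = 1.96: the code solves the equivalent quadratic (1 + z²/n)p² - (2p̂ + z²/n)p + p̂² = 0, whose roots in closed form are

\[
p_{\pm} = \frac{\hat{p} + \frac{z^2}{2n} \pm z\sqrt{\frac{\hat{p}(1-\hat{p})}{n} + \frac{z^2}{4n^2}}}{1 + \frac{z^2}{n}}
\]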

const output = [];

const z = 1.96;

/**
 * Compute the 95% confidence bounds of a book's xian+liang ratio
 * @param {object} book book object
 */
function getEstimate(book) {
    var xl = book.xian + book.liang;
    var n = xl + book.gan + book.ku + book.du;

    if (isNaN(xl) || isNaN(n) || xl < 5 || xl > n - 5) {
        book.lower = book.upper = 0;
        return;
    }

    var p = xl / n;
    var a = 1 + z**2 / n;
    var b = 2 * p + z**2 / n;
    var c = p**2;
    book.lower = (b - Math.sqrt(b**2 - 4*a*c)) / (2*a);
    book.upper = (b + Math.sqrt(b**2 - 4*a*c)) / (2*a);
}

async function parseJson(filename) {
    let response = await fetch(filename);
    let result = await response.text();
    result = JSON.parse(result);
    for (var book of result) {
        getEstimate(book);
        output.push(book);
    }
}

const files = ['二次元.json', '奇幻·玄幻.json', '科幻·灵异.json', '都市·娱乐.json', '历史·军事.json', '武侠·仙侠.json', '竞技·游戏.json'];

function download(obj, name) {
    var a = document.createElement('a');
    var e = document.createEvent('MouseEvents');
    e.initEvent('click', false, false);
    a.download = `${name}.json`;
    var blob = new Blob([JSON.stringify(obj)]);
    a.href = URL.createObjectURL(blob);
    a.dispatchEvent(e);
}

(async () => {
    for (var file of files) {
        await parseJson(file);
    }

    download(output, 'all');
})();
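
The scripts above only compute the bounds; the actual sorting by lower bound can be done wherever convenient. A minimal sketch in Python, assuming the merged file written above is named all.json:

import json

with open('all.json', encoding='utf-8') as f:
    books = json.load(f)

# Rank books by the lower bound of the confidence interval, best first
books.sort(key=lambda b: b.get('lower', 0), reverse=True)
for b in books[:20]:
    print(round(b.get('lower', 0), 3), b.get('title'))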

In addition, a simple web-page version is attached.