pyspider示例代码六：传递参数

传递参数

示例一

#!/usr/bin/env python

# -*- encoding: utf-8 -*-

# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:

# Created on -- ::

import re

import json

from libs.pprint import pprint

from libs.base_handler import *

class Handler(BaseHandler):
    """Sample handler: crawl a Douban group and forward posts that have images.

    Flow: ``on_start`` queues the group's discussion list, ``index_page``
    queues every topic linked from that list, ``detail_page`` extracts the
    topic fields, and ``on_result`` POSTs each result that contains images
    to the Duoshuo import API.

    NOTE(review): the numeric literals of the published listing were lost
    (``@every(, )``, ``@config(age=)``, ``@config(age=***)``, ``.group()``).
    The values used below are reconstructions -- confirm them before use.
    """

    crawl_config = {
    }

    # Prefix prepended to the Douban URLs queued by on_start and index_page;
    # the empty string leaves those URLs unchanged.
    proxy = ""

    # NOTE(review): listing read ``@every(, )``; restored as
    # (minutes, seconds) = (0, 30) -- confirm.
    @every(0, 30)
    def on_start(self):
        """Queue the group's discussion list, forcing a re-crawl each run."""
        self.crawl(self.proxy + 'http://www.douban.com/group/haixiuzu/discussion',
                   force_update=True, callback=self.index_page)

    # NOTE(review): listing read ``@config(age=)``; 10 is a guess -- confirm.
    @config(age=10)
    def index_page(self, response):
        """Queue every topic link found in the discussion list."""
        for each in response.doc('tr > .title > a').items():
            self.crawl(self.proxy + each.attr.href, callback=self.detail_page)

    # NOTE(review): listing read ``@config(age=***)`` (four factors); the
    # leading factor 10 is a guess, 24*60*60 is one day in seconds -- confirm.
    @config(age=10*24*60*60)
    def detail_page(self, response):
        """Extract url, title, author, author link and image URLs of a topic.

        Returns:
            dict with keys ``url``, ``title``, ``author``, ``author_url``
            and ``imgs`` (list of image ``src`` values).

        Raises:
            AssertionError: if the response URL is the Douban home page.
        """
        # Presumably guards against being redirected to the home page
        # instead of the topic -- verify against the site's behaviour.
        assert response.url != "https://www.douban.com/"
        return {
            "url": response.url,
            "title": response.doc("#content h1").text(),
            "author": response.doc(".topic-content .from a").text(),
            "author_url": response.doc("DIV.topic-doc>H3>SPAN.from>A").attr.href,
            "imgs": [x.attr.src for x in response.doc('.topic-doc img').items()],
        }

    def on_result(self, result):
        """POST a result that contains images to the Duoshuo import API.

        Results that are empty or have no images are ignored. The whole
        result dict is JSON-encoded and base64-encoded into the message.
        """
        if not result or not result['imgs']:
            return

        # Local import keeps this block self-contained; base64.b64encode
        # works on both Python 2 and 3, unlike ``str.encode("base64")``,
        # and emits no newlines, so no stripping is needed.
        import base64

        # NOTE(review): listing read ``.group()``; group(1) (the numeric
        # topic id captured by the pattern) is assumed -- confirm.
        post_id = re.search(r"topic/(\d+)", self.response.url).group(1)
        message = base64.b64encode(
            json.dumps(result).encode("utf-8")).decode("ascii")

        self.crawl(
            "https://api.duoshuo.com/posts/import.json#" + post_id,
            method="POST",
            data={
                "short_name": "database",
                # NOTE(security): credential hard-coded in source; it should
                # be loaded from configuration rather than committed.
                "secret": "8e5a5be8873ad7e9a59147c3cfd10e73",
                "posts[0][post_key]": post_id,
                "posts[0][thread_key]": "haixiuzu",
                "posts[0][message]": message,
            },
            callback=self.post_to_duoshuo)

    def post_to_duoshuo(self):
        """Callback for the Duoshuo POST; the response is ignored."""
        pass

示例二

#!/usr/bin/env python

# -*- encoding: utf-8 -*-

# Created on -- ::

# Project: prieto

import re

from pyspider.libs.base_handler import *

class Handler(BaseHandler):
    """Demonstrate passing a parameter between callbacks with ``save``.

    ``on_start`` queues one placeholder ``data:`` task per step and attaches
    the step index through ``save``; ``gen_url`` reads it back from
    ``respond.save`` and queues the printable view of every thread id in
    that step's range; ``index_page`` returns the page HTML per thread.

    NOTE(review): every numeric literal of the published listing was lost
    (``range()``, ``respond.save * ``, ``(respond.save + ) * ``,
    ``priority=``, ``minutes= * ``, ``[-]``). The values below are
    reconstructions or placeholders -- confirm them before use.
    """

    crawl_config = {
    }

    # NOTE(review): placeholders for the lost ``range()`` argument and the
    # lost multiplier in gen_url; the original values are unknown -- confirm.
    STEP_COUNT = 100   # number of steps queued by on_start
    STEP_SIZE = 100    # number of consecutive thread ids per step

    # NOTE(review): listing read ``minutes= * ``; restored as 24 * 60
    # (once a day) -- confirm.
    @every(minutes=24 * 60)
    def on_start(self):
        """Queue one placeholder task per step, carrying the index in ``save``."""
        for i in range(self.STEP_COUNT):
            self.crawl('data:,step%d' % i, callback=self.gen_url, save=i)

    # NOTE(review): priority value lost in the listing; 1 is a guess.
    @config(priority=1)
    def gen_url(self, respond):
        """Queue the printable page of every thread id in this step's range.

        The step index comes from ``respond.save``; the range covers
        ``[save * STEP_SIZE, (save + 1) * STEP_SIZE)``.
        """
        size = self.STEP_SIZE
        # NOTE(review): the ``+ 1`` is inferred from the contiguous-range
        # pattern; the listing showed ``(respond.save + ) * ``.
        for i in range(respond.save * size, (respond.save + 1) * size):
            self.crawl("http://bbs.fobshanghai.com/viewthread.php?action=printable&tid=%d" % i, callback=self.index_page)

    # NOTE(review): priority value lost in the listing; 2 is a guess.
    @config(priority=2)
    def index_page(self, respond):
        """Return the thread id, URL and HTML of a printable thread page.

        A page whose head markup starts with a meta tag is reported as a
        missing thread, with a fixed message in place of the HTML.

        Returns:
            dict with keys ``tid``, ``url`` and ``html``.
        """
        # NOTE(review): listing read ``[-]``; the last '='-separated piece
        # of the URL (the tid value) is assumed.
        tid = respond.url.split('=')[-1]

        # .html() may yield None when the selection is empty; treat that as
        # an empty head instead of failing on None.startswith.
        head_html = respond.doc('head').html() or ''
        if head_html.startswith('<meta'):
            return {
                "tid": tid,
                "url": respond.url,
                "html": 'The specified thread does not exist.',
            }

        # TODO(review): the listing carried a disabled sketch that split the
        # body HTML into individual posts; its index literals were lost, so
        # it is kept here only as a note:
        #   hr_black = u'<hr noshade="noshade" size="2" width="100%" color="#808080"/>'
        #   hr_blue = u'<br/><br/><br/><br/><hr noshade="noshade" size="2" width="100%" color="#698cc3"/>'
        #   posts = respond.doc('body').html().split(hr_blue)[?].split(hr_black)[?:]
        # It also planned "t_author" and "replies" fields, with a remark
        # that a regular expression would be a better way to get the author.
        return {
            "tid": tid,
            "url": respond.url,
            "html": respond.doc.html(),
        }

秒客网

pyspider示例代码六：传递参数

传递参数

相关文章