传递参数
示例一
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Created on: (timestamp lost when this snippet was extracted)
#
# NOTE(review): every bare numeric literal in this example was stripped when
# the snippet was extracted. Values tagged "restored" below are best guesses
# and must be confirmed before the script is scheduled.

import base64
import re
import json
from libs.pprint import pprint
from libs.base_handler import *


class Handler(BaseHandler):
    """Crawl the Douban "haixiuzu" group and forward topics with images to Duoshuo.

    Flow: ``on_start`` fetches the discussion list, ``index_page`` queues
    every topic link found there, ``detail_page`` extracts the topic fields,
    and ``on_result`` posts each result that contains images to the Duoshuo
    import endpoint.
    """

    crawl_config = {
    }

    # Prefix prepended to every Douban URL; the empty string leaves the URL
    # unchanged.
    proxy = ""

    # restored: two positional arguments were present; assumed to be
    # (minutes, seconds) -- NOTE(review): confirm both the order and values.
    @every(0, 30)
    def on_start(self):
        """Queue the group's discussion list, forcing an update on each run."""
        self.crawl(self.proxy + 'http://www.douban.com/group/haixiuzu/discussion',
                   force_update=True, callback=self.index_page)

    # restored: a single literal was present -- NOTE(review): confirm value.
    @config(age=10)
    def index_page(self, response):
        """Queue every topic linked from the discussion list."""
        for each in response.doc('tr > .title > a').items():
            self.crawl(self.proxy + each.attr.href, callback=self.detail_page)

    # restored: a product of four literals was present (``age=***``);
    # 30 days in seconds is a guess -- NOTE(review): confirm.
    @config(age=30 * 24 * 60 * 60)
    def detail_page(self, response):
        """Extract the fields of one topic page.

        Returns a dict with the keys ``url``, ``title``, ``author``,
        ``author_url`` and ``imgs`` (list of image ``src`` attributes).

        Raises AssertionError when the response URL is the Douban home page.
        """
        # Presumably a topic that is no longer available ends up on the home
        # page; failing here keeps such pages out of the results.
        assert response.url != "https://www.douban.com/"
        return {
            "url": response.url,
            "title": response.doc("#content h1").text(),
            "author": response.doc(".topic-content .from a").text(),
            "author_url": response.doc("DIV.topic-doc>H3>SPAN.from>A").attr.href,
            "imgs": [x.attr.src for x in response.doc('.topic-doc img').items()]
        }

    def on_result(self, result):
        """Post a scraped topic to the Duoshuo import API.

        Results that are empty or carry no images are ignored.
        """
        if not result or not result['imgs']:
            return
        # group(1) is the numeric id captured by (\d+); a bare group() would
        # also include the "topic/" prefix. The index itself was stripped
        # from the snippet -- NOTE(review): confirm.
        post_id = re.search(r"topic/(\d+)", self.response.url).group(1)
        # str.encode("base64") only exists on Python 2. b64encode yields the
        # same text without the line breaks the original had to strip, and
        # works on both Python 2 and 3.
        message = base64.b64encode(json.dumps(result).encode("utf-8")).decode("ascii")
        # The "#<post_id>" fragment presumably keeps each import request URL
        # distinct so requests for different posts are not collapsed.
        self.crawl("https://api.duoshuo.com/posts/import.json#" + post_id, method="POST",
                   data={
                       "short_name": "database",
                       # NOTE(review): API secret is hard-coded in source;
                       # consider loading it from configuration instead.
                       "secret": "8e5a5be8873ad7e9a59147c3cfd10e73",
                       "posts[0][post_key]": post_id,
                       "posts[0][thread_key]": "haixiuzu",
                       "posts[0][message]": message
                   }, callback=self.post_to_duoshuo)

    def post_to_duoshuo(self):
        """Callback for the Duoshuo import request; intentionally does nothing."""
        pass
示例二
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on: (timestamp lost when this snippet was extracted)
# Project: prieto
#
# NOTE(review): every bare numeric literal in this example was stripped when
# the snippet was extracted. Values tagged "restored" below are best guesses
# and must be confirmed before the script is scheduled.

import re
from pyspider.libs.base_handler import *

# restored placeholders -- NOTE(review): the original counts are unknown.
STEP_COUNT = 10        # number of seed tasks created by on_start
TIDS_PER_STEP = 1000   # consecutive thread ids expanded by each seed task

# Separator markup used by a disabled draft that split the page body into
# individual posts; kept so that work can be resumed.
HR_BLACK = u'<hr noshade="noshade" size="2" width="100%" color="#808080"/>'
HR_BLUE = u'<br/><br/><br/><br/><hr noshade="noshade" size="2" width="100%" color="#698cc3"/>'


class Handler(BaseHandler):
    """Fetch the printable view of bbs.fobshanghai.com threads by numeric id.

    ``on_start`` creates ``STEP_COUNT`` seed tasks, each carrying its step
    index in ``save``; ``gen_url`` expands a seed into one crawl per thread
    id; ``index_page`` returns the thread id, URL and page HTML.
    """

    crawl_config = {
    }

    # restored: a product of two literals was present; one day in minutes is
    # a guess -- NOTE(review): confirm.
    @every(minutes=24 * 60)
    def on_start(self):
        """Create one seed task per step, passing the step index via ``save``."""
        for step in range(STEP_COUNT):
            # "data:" URLs carry no remote resource; the task exists only to
            # hand its step index to gen_url.
            self.crawl('data:,step%d' % step, callback=self.gen_url, save=step)

    # restored -- NOTE(review): confirm the priority value.
    @config(priority=0)
    def gen_url(self, respond):
        """Queue the printable page of every thread id in this step's range."""
        first_tid = respond.save * TIDS_PER_STEP
        for tid in range(first_tid, first_tid + TIDS_PER_STEP):
            self.crawl("http://bbs.fobshanghai.com/viewthread.php?action=printable&tid=%d" % tid,
                       callback=self.index_page)

    # restored -- NOTE(review): confirm the priority value.
    @config(priority=1)
    def index_page(self, respond):
        """Return ``tid``, ``url`` and ``html`` for one printable thread page.

        When the page's ``<head>`` content starts with a ``<meta`` tag the
        thread is reported as missing and ``html`` is a fixed message instead
        of the page markup.
        """
        # TODO: extract the thread author and the individual replies. The
        # disabled draft split the body on HR_BLUE and then on HR_BLACK; a
        # regular expression would probably be a better fit.
        # .html() yields None when nothing matches, so fall back to "".
        head_html = respond.doc('head').html() or ''
        if head_html.startswith('<meta'):
            html = 'The specified thread does not exist.'
        else:
            html = respond.doc.html()
        return {
            # The crawled URL ends in "tid=<id>", so the last "="-separated
            # piece is the thread id (index restored as -1).
            "tid": respond.url.split('=')[-1],
            "url": respond.url,
            "html": html,
        }