本文实例讲述了Python实现在线程里运行scrapy的方法。分享给大家供大家参考。具体如下:
如果你希望在一个写好的程序里调用scrapy,就可以通过下面的代码,让scrapy运行在一个线程里。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
|
"""
Code to run Scrapy crawler in a thread - works on Scrapy 0.8
"""
import threading, Queue
from twisted.internet import reactor
from scrapy.xlib.pydispatch import dispatcher
from scrapy.core.manager import scrapymanager
from scrapy.core.engine import scrapyengine
from scrapy.core import signals
class CrawlerThread(threading.Thread):
    """Run the Scrapy crawler (Scrapy 0.8) inside a dedicated thread.

    The Twisted reactor is started in this thread with signal handlers
    disabled so it can live outside the main thread; crawl requests from
    other threads are marshalled onto the reactor thread with
    reactor.callFromThread and block until the spider finishes.
    """

    def __init__(self):
        threading.Thread.__init__(self)
        self.running = False  # becomes True once run() starts the reactor

    def run(self):
        """Thread body: start Scrapy and spin the Twisted reactor."""
        self.running = True
        # control_reactor=False: we start/stop the reactor ourselves here.
        scrapymanager.configure(control_reactor=False)
        scrapymanager.start()
        # installSignalHandlers=False: Unix signal handlers may only be
        # installed from the main thread, and this is not it.
        reactor.run(installSignalHandlers=False)

    def crawl(self, *args):
        """Schedule a crawl and block until the spider_closed signal fires.

        Raises RuntimeError if the thread has not been started yet.
        """
        if not self.running:
            raise RuntimeError("CrawlerThread not running")
        self._call_and_block_until_signal(signals.spider_closed,
                                          scrapymanager.crawl, *args)

    def stop(self):
        """Ask the reactor thread to shut the Scrapy engine down."""
        reactor.callFromThread(scrapyengine.stop)

    def _call_and_block_until_signal(self, signal, f, *a, **kw):
        # Run f(*a, **kw) on the reactor thread, then park the calling
        # thread on a Queue until `signal` is dispatched.
        q = Queue.Queue()

        def unblock():
            q.put(None)  # sentinel: releases the q.get() below

        dispatcher.connect(unblock, signal=signal)
        reactor.callFromThread(f, *a, **kw)
        q.get()  # blocks until unblock() runs on signal dispatch
# Usage example below:
import os
os.environ.setdefault( 'SCRAPY_SETTINGS_MODULE' , 'myproject.settings' )
from scrapy.xlib.pydispatch import dispatcher
from scrapy.core import signals
from scrapy.conf import settings
from scrapy.crawler import CrawlerThread
settings.overrides[ 'LOG_ENABLED' ] = False # avoid log noise
def item_passed(item):
print "Just scraped item:" , item
dispatcher.connect(item_passed, signal = signals.item_passed)
crawler = CrawlerThread()
print "Starting crawler thread..."
crawler.start()
print "Crawling somedomain.com...."
crawler.crawl('somedomain.com) # blocking call
print "Crawling anotherdomain.com..."
crawler.crawl( 'anotherdomain.com' ) # blocking call
print "Stopping crawler thread..."
crawler.stop()
|
希望本文所述对大家的Python程序设计有所帮助。