This post walks through a simple spider (scraper) built on Scrapy, shared here for reference. The details are as follows:
# Standard Python library imports

# 3rd party imports
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

# My imports
from poetry_analysis.items import PoetryAnalysisItem

HTML_FILE_NAME = r'.+\.html'


class PoetryParser(object):
    """
    Provides common parsing method for poems formatted this one specific way.
    """
    date_pattern = r'(\d{2} \w{3,9} \d{4})'

    def parse_poem(self, response):
        hxs = HtmlXPathSelector(response)
        item = PoetryAnalysisItem()
        # All poetry text is in pre tags
        text = hxs.select('//pre/text()').extract()
        item['text'] = ''.join(text)
        item['url'] = response.url
        # head/title contains "title - a poem by author"
        title_text = hxs.select('//head/title/text()').extract()[0]
        item['title'], item['author'] = title_text.split(' - ')
        item['author'] = item['author'].replace('a poem by', '')
        for key in ['title', 'author']:
            item[key] = item[key].strip()
        # The date sits in a <p class="small"> element; date_pattern is a
        # class attribute, so it is referenced through self
        item['date'] = hxs.select("//p[@class='small']/text()").re(self.date_pattern)
        return item


class PoetrySpider(CrawlSpider, PoetryParser):
    name = 'example.com_poetry'
    allowed_domains = ['www.example.com']
    root_path = 'someuser/poetry/'
    start_urls = ['http://www.example.com/someuser/poetry/recent/',
                  'http://www.example.com/someuser/poetry/less_recent/']

    rules = [Rule(SgmlLinkExtractor(allow=[start_urls[0] + HTML_FILE_NAME]),
                  callback='parse_poem'),
             Rule(SgmlLinkExtractor(allow=[start_urls[1] + HTML_FILE_NAME]),
                  callback='parse_poem')]
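
The spider imports PoetryAnalysisItem from poetry_analysis.items, but that module is not shown in the post. A minimal sketch of what it might contain, inferred from the fields filled in parse_poem (the exact field list is an assumption), looks like this:

# Hypothetical poetry_analysis/items.py, reconstructed from the keys
# assigned in parse_poem; the original post does not show this file.
from scrapy.item import Item, Field


class PoetryAnalysisItem(Item):
    text = Field()    # poem body gathered from the <pre> tags
    url = Field()     # URL the poem was scraped from
    title = Field()   # poem title taken from <head>/<title>
    author = Field()  # author name taken from <head>/<title>
    date = Field()    # date string(s) matched by date_pattern

With an item definition like this in place, the spider can be run from the project directory with scrapy crawl example.com_poetry -o poems.json, which writes the collected items to a JSON file.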
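One caveat: the scrapy.contrib package used above was deprecated in later Scrapy releases and has since been removed. On a current Scrapy version the same spider could be written roughly as follows; this is only a sketch, assuming Scrapy 1.x or newer, with LinkExtractor and response.xpath standing in for SgmlLinkExtractor and HtmlXPathSelector:

# Rough modern-Scrapy equivalent (assumption: Scrapy >= 1.0);
# extraction logic is abbreviated to the text and url fields.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from poetry_analysis.items import PoetryAnalysisItem

HTML_FILE_NAME = r'.+\.html'


class PoetrySpider(CrawlSpider):
    name = 'example.com_poetry'
    allowed_domains = ['www.example.com']
    start_urls = ['http://www.example.com/someuser/poetry/recent/',
                  'http://www.example.com/someuser/poetry/less_recent/']
    rules = [Rule(LinkExtractor(allow=[start_urls[0] + HTML_FILE_NAME]),
                  callback='parse_poem'),
             Rule(LinkExtractor(allow=[start_urls[1] + HTML_FILE_NAME]),
                  callback='parse_poem')]

    def parse_poem(self, response):
        # Same extraction idea as above; response.xpath replaces the
        # HtmlXPathSelector(response).select calls
        item = PoetryAnalysisItem()
        item['text'] = ''.join(response.xpath('//pre/text()').extract())
        item['url'] = response.url
        return item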
Hopefully this post is of some help to readers working on Python programming.