This lesson introduces the Scrapy crawling framework, with a focus on the Spider component.
A Spider can crawl in several ways:
- crawl a single page
- build URLs from a given list and crawl multiple pages
- find the 'next page' link and follow it page by page
- enter each link and crawl the linked pages
Examples of each approach are given below.
1. Crawling a single page
#by 寒小阳(hanxiaoyang.ml@gmail.com)
import scrapy


class JulyeduSpider(scrapy.Spider):
    name = "julyedu"
    start_urls = [
        'https://www.julyedu.com/category/index',
    ]

    def parse(self, response):
        # each course card on the listing page
        for julyedu_class in response.xpath('//div[@class="course_info_box"]'):
            print(julyedu_class.xpath('a/h4/text()').extract_first())
            print(julyedu_class.xpath('a/p[@class="course-info-tip"][1]/text()').extract_first())
            print(julyedu_class.xpath('a/p[@class="course-info-tip"][2]/text()').extract_first())
            print(response.urljoin(julyedu_class.xpath('a/img[1]/@src').extract_first()))
            print("\n")
            yield {
                'title': julyedu_class.xpath('a/h4/text()').extract_first(),
                'desc': julyedu_class.xpath('a/p[@class="course-info-tip"][1]/text()').extract_first(),
                'time': julyedu_class.xpath('a/p[@class="course-info-tip"][2]/text()').extract_first(),
                'img_url': response.urljoin(julyedu_class.xpath('a/img[1]/@src').extract_first()),
            }
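To try the spider without setting up a full project, it can be launched from a plain Python script. The sketch below is one way to do that, assuming Scrapy 2.1+ (for the FEEDS setting); the output filename is just an example.

# Minimal sketch: run JulyeduSpider from a script and save yielded items as JSON
# (assumes Scrapy >= 2.1 for the FEEDS setting; the filename is just an example).
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    "FEEDS": {"julyedu_courses.json": {"format": "json"}},
})
process.crawl(JulyeduSpider)   # the spider class defined above
process.start()                # blocks until the crawl finishes

The same thing can be done from the command line with scrapy runspider, or with scrapy crawl julyedu inside a Scrapy project.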
2. Building URLs from a given list to crawl multiple pages
#by 寒小阳(hanxiaoyang.ml@gmail.com)
import scrapy


class CnBlogSpider(scrapy.Spider):
    name = "cnblogs"
    allowed_domains = ["cnblogs.com"]
    # build the 10 listing-page URLs up front
    start_urls = [
        'http://www.cnblogs.com/pick/#p%s' % p for p in range(1, 11)
    ]

    def parse(self, response):
        for article in response.xpath('//div[@class="post_item"]'):
            print(article.xpath('div[@class="post_item_body"]/h3/a/text()').extract_first().strip())
            print(response.urljoin(article.xpath('div[@class="post_item_body"]/h3/a/@href').extract_first()).strip())
            print(article.xpath('div[@class="post_item_body"]/p/text()').extract_first().strip())
            print(article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/a/text()').extract_first().strip())
            print(response.urljoin(article.xpath('div[@class="post_item_body"]/div/a/@href').extract_first()).strip())
            print(article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/span[@class="article_comment"]/a/text()').extract_first().strip())
            print(article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/span[@class="article_view"]/a/text()').extract_first().strip())
            print("")
            yield {
                'title': article.xpath('div[@class="post_item_body"]/h3/a/text()').extract_first().strip(),
                'link': response.urljoin(article.xpath('div[@class="post_item_body"]/h3/a/@href').extract_first()).strip(),
                'summary': article.xpath('div[@class="post_item_body"]/p/text()').extract_first().strip(),
                'author': article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/a/text()').extract_first().strip(),
                'author_link': response.urljoin(article.xpath('div[@class="post_item_body"]/div/a/@href').extract_first()).strip(),
                'comment': article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/span[@class="article_comment"]/a/text()').extract_first().strip(),
                'view': article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/span[@class="article_view"]/a/text()').extract_first().strip(),
            }
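Building start_urls with a list comprehension works, but the same paging can also be written with Scrapy's start_requests() method, which yields one Request per page. The sketch below reuses the URL pattern from the example above; the class name is made up and the parse body is elided.

# Sketch: the same ten listing pages generated in start_requests()
# instead of a prebuilt start_urls list (class name is illustrative).
import scrapy


class CnBlogPagesSpider(scrapy.Spider):
    name = "cnblogs_pages"
    allowed_domains = ["cnblogs.com"]

    def start_requests(self):
        for p in range(1, 11):
            url = 'http://www.cnblogs.com/pick/#p%s' % p
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # same extraction logic as CnBlogSpider.parse above
        ...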
3. Following the 'next page' link
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/tag/humor/',
    ]

    def parse(self, response):
        for quote in response.xpath('//div[@class="quote"]'):
            yield {
                'text': quote.xpath('span[@class="text"]/text()').extract_first(),
                'author': quote.xpath('span/small[@class="author"]/text()').extract_first(),
            }
        # follow the 'next page' link if there is one
        next_page = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
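Newer Scrapy versions (1.4+) provide response.follow(), which resolves the relative URL itself, so the urljoin step is not needed. A sketch of the same spider written that way (the class name is made up):

# Sketch: same pagination using response.follow() (Scrapy >= 1.4),
# which joins the relative href with the current page URL for us.
import scrapy


class QuotesFollowSpider(scrapy.Spider):
    name = "quotes_follow"
    start_urls = ['http://quotes.toscrape.com/tag/humor/']

    def parse(self, response):
        for quote in response.xpath('//div[@class="quote"]'):
            yield {
                'text': quote.xpath('span[@class="text"]/text()').extract_first(),
                'author': quote.xpath('span/small[@class="author"]/text()').extract_first(),
            }
        next_page = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)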
4. Entering links and crawling the linked pages
#by 寒小阳(hanxiaoyang.ml@gmail.com)
import scrapy


class QQNewsSpider(scrapy.Spider):
    name = 'qqnews'
    start_urls = ['http://news.qq.com/society_index.shtml']

    def parse(self, response):
        # collect article links on the index page and follow each one
        for href in response.xpath('//*[@id="news"]/div/div/div/div/em/a/@href'):
            full_url = response.urljoin(href.extract())
            yield scrapy.Request(full_url, callback=self.parse_question)

    def parse_question(self, response):
        print(response.xpath('//div[@class="qq_article"]/div/h1/text()').extract_first())
        print(response.xpath('//span[@class="a_time"]/text()').extract_first())
        print(response.xpath('//span[@class="a_catalog"]/a/text()').extract_first())
        print("\n".join(response.xpath('//div[@id="Cnt-Main-Article-QQ"]/p[@class="text"]/text()').extract()))
        print("")
        yield {
            'title': response.xpath('//div[@class="qq_article"]/div/h1/text()').extract_first(),
            'content': "\n".join(response.xpath('//div[@id="Cnt-Main-Article-QQ"]/p[@class="text"]/text()').extract()),
            'time': response.xpath('//span[@class="a_time"]/text()').extract_first(),
            'cate': response.xpath('//span[@class="a_catalog"]/a/text()').extract_first(),
        }
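When a field is already visible on the index page, it can be handed over to the detail-page callback instead of being re-extracted there. The sketch below uses Request's cb_kwargs argument (Scrapy 1.7+; request.meta works similarly on older versions); the class name is made up, the XPaths mirror the example above, and whether the index-page link text actually holds the headline is an assumption.

# Sketch: pass data from the index page to the detail-page callback via cb_kwargs
# (Scrapy >= 1.7). Class name is illustrative; XPaths reuse the example above.
import scrapy


class QQNewsLinkedSpider(scrapy.Spider):
    name = 'qqnews_linked'
    start_urls = ['http://news.qq.com/society_index.shtml']

    def parse(self, response):
        for link in response.xpath('//*[@id="news"]/div/div/div/div/em/a'):
            # assumption: the link text on the index page is the headline
            title = link.xpath('text()').extract_first()
            url = response.urljoin(link.xpath('@href').extract_first())
            yield scrapy.Request(url, callback=self.parse_article,
                                 cb_kwargs={'title': title})

    def parse_article(self, response, title):
        # 'title' arrives here as a keyword argument via cb_kwargs
        yield {
            'title': title,
            'time': response.xpath('//span[@class="a_time"]/text()').extract_first(),
            'content': "\n".join(response.xpath(
                '//div[@id="Cnt-Main-Article-QQ"]/p[@class="text"]/text()').extract()),
        }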
Summary
That covers the example code for the several Scrapy Spider crawling approaches discussed in this article; hopefully it is useful. Interested readers can browse the other related topics on this site; if anything is missing, feel free to point it out in the comments. Thanks for your support!
Original article: http://blog.csdn.net/nnnnnnnnnnnny/article/details/54342423