This article introduces the Scrapy framework through two complete, worked examples: douban, a text spider that crawls Douban, and douban_imgs, an image-download spider. Both projects were written for Python 2; the first also uses the faker library to generate a random User-Agent. The details follow.
Example 1: douban
Directory tree
douban
├── douban
│   ├── spiders
│   │   ├── __init__.py
│   │   ├── bookspider.py
│   │   ├── douban_comment_spider.py
│   │   └── doumailspider.py
│   ├── __init__.py
│   ├── items.py
│   ├── pipelines.py
│   └── settings.py
└── scrapy.cfg
spiders/__init__.py
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
bookspider.py
# -*- coding:utf-8 -*-
'''by sudo rm -rf  http://imchenkun.com'''
import scrapy
from douban.items import DoubanBookItem


class BookSpider(scrapy.Spider):
    name = 'douban-book'
    allowed_domains = ['douban.com']
    start_urls = [
        'https://book.douban.com/top250'
    ]

    def parse(self, response):
        # Request the first page again so parse_next handles it; dont_filter
        # is needed because this URL was already fetched as a start URL and
        # would otherwise be dropped by the duplicate filter.
        yield scrapy.Request(response.url, callback=self.parse_next, dont_filter=True)
        # Request the remaining pages
        for page in response.xpath('//div[@class="paginator"]/a'):
            link = page.xpath('@href').extract()[0]
            yield scrapy.Request(link, callback=self.parse_next)

    def parse_next(self, response):
        for item in response.xpath('//tr[@class="item"]'):
            book = DoubanBookItem()
            book['name'] = item.xpath('td[2]/div[1]/a/@title').extract()[0]
            book['content'] = item.xpath('td[2]/p/text()').extract()[0]
            book['ratings'] = item.xpath('td[2]/div[2]/span[2]/text()').extract()[0]
            yield book
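Assuming the scraped books should be persisted rather than just passed through pipelines, the spider can be launched with Scrapy's standard feed export. A minimal runner sketch (the file name run_bookspider.py is hypothetical; it mirrors run_spider.py from the second example):

# run_bookspider.py -- hypothetical runner; '-o books.json' uses Scrapy's
# built-in feed export to write every yielded DoubanBookItem to a JSON file.
from scrapy import cmdline

cmdline.execute('scrapy crawl douban-book -o books.json'.split())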
douban_comment_spider.py
# -*- coding:utf-8 -*-
import scrapy
from faker import Factory
from douban.items import DoubanMovieCommentItem
import urlparse

f = Factory.create()


class CommentSpider(scrapy.Spider):
    name = 'douban-comment'
    allowed_domains = ['accounts.douban.com', 'douban.com']
    start_urls = [
        'https://www.douban.com/'
    ]

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Host': 'accounts.douban.com',
        'User-Agent': f.user_agent()
    }

    formdata = {
        'form_email': '你的邮箱',     # your Douban account email
        'form_password': '你的密码',  # your Douban account password
        # 'captcha-solution': '',
        # 'captcha-id': '',
        'login': '登录',  # value of the submit button; the form expects this exact string
        'redir': 'https://www.douban.com/',
        'source': 'None'
    }

    def start_requests(self):
        return [scrapy.Request(url='https://www.douban.com/accounts/login',
                               headers=self.headers,
                               meta={'cookiejar': 1},
                               callback=self.parse_login)]

    def parse_login(self, response):
        # If a captcha is present, it has to be solved manually
        if 'captcha_image' in response.body:
            print 'Copy the link:'
            link = response.xpath('//img[@class="captcha_image"]/@src').extract()[0]
            print link
            captcha_solution = raw_input('captcha-solution:')
            captcha_id = urlparse.parse_qs(urlparse.urlparse(link).query, True)['id']
            self.formdata['captcha-solution'] = captcha_solution
            self.formdata['captcha-id'] = captcha_id
        return [scrapy.FormRequest.from_response(response,
                                                 formdata=self.formdata,
                                                 headers=self.headers,
                                                 meta={'cookiejar': response.meta['cookiejar']},
                                                 callback=self.after_login
                                                 )]

    def after_login(self, response):
        print response.status
        self.headers['Host'] = "www.douban.com"
        yield scrapy.Request(url='https://movie.douban.com/subject/22266320/reviews',
                             meta={'cookiejar': response.meta['cookiejar']},
                             headers=self.headers,
                             callback=self.parse_comment_url)
        yield scrapy.Request(url='https://movie.douban.com/subject/22266320/reviews',
                             meta={'cookiejar': response.meta['cookiejar']},
                             headers=self.headers,
                             callback=self.parse_next_page,
                             dont_filter=True)  # same URL as above, so skip the duplicate filter

    def parse_next_page(self, response):
        print response.status
        try:
            next_url = response.urljoin(response.xpath('//span[@class="next"]/a/@href').extract()[0])
            print "next page:"
            print next_url
            yield scrapy.Request(url=next_url,
                                 meta={'cookiejar': response.meta['cookiejar']},
                                 headers=self.headers,
                                 callback=self.parse_comment_url,
                                 dont_filter=True)
            yield scrapy.Request(url=next_url,
                                 meta={'cookiejar': response.meta['cookiejar']},
                                 headers=self.headers,
                                 callback=self.parse_next_page,
                                 dont_filter=True)
        except IndexError:  # no "next" link on the last page
            print "Next page Error"
            return

    def parse_comment_url(self, response):
        print response.status
        for item in response.xpath('//div[@class="main review-item"]'):
            comment_url = item.xpath('header/h3[@class="title"]/a/@href').extract()[0]
            comment_title = item.xpath('header/h3[@class="title"]/a/text()').extract()[0]
            print comment_title
            print comment_url
            yield scrapy.Request(url=comment_url,
                                 meta={'cookiejar': response.meta['cookiejar']},
                                 headers=self.headers,
                                 callback=self.parse_comment)

    def parse_comment(self, response):
        print response.status
        for item in response.xpath('//div[@id="content"]'):
            comment = DoubanMovieCommentItem()
            comment['useful_num'] = item.xpath('//div[@class="main-panel-useful"]/button[1]/text()').extract()[0].strip()
            comment['no_help_num'] = item.xpath('//div[@class="main-panel-useful"]/button[2]/text()').extract()[0].strip()
            comment['people'] = item.xpath('//span[@property="v:reviewer"]/text()').extract()[0]
            comment['people_url'] = item.xpath('//header[@class="main-hd"]/a[1]/@href').extract()[0]
            comment['star'] = item.xpath('//header[@class="main-hd"]/span[1]/@title').extract()[0]
            data_type = item.xpath('//div[@id="link-report"]/div/@data-original').extract()[0]
            print "data_type: " + data_type
            if data_type == '0':
                comment['comment'] = "\t#####\t".join(map(lambda x: x.strip(), item.xpath('//div[@id="link-report"]/div/p/text()').extract()))
            elif data_type == '1':
                comment['comment'] = "\t#####\t".join(map(lambda x: x.strip(), item.xpath('//div[@id="link-report"]/div[1]/text()').extract()))
            comment['title'] = item.xpath('//span[@property="v:summary"]/text()').extract()[0]
            comment['comment_page_url'] = response.url
            # print comment
            yield comment
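One subtlety in parse_login above: urlparse.parse_qs returns a list of values for each query key, so captcha_id is actually a one-element list (FormRequest accepts list form values, so the login still works). A standalone sketch, with a made-up captcha URL:

# Sketch of the captcha-id extraction; the link below is hypothetical.
import urlparse  # Python 2 module; urllib.parse on Python 3

link = 'https://www.douban.com/misc/captcha?id=abc123:en&size=s'
print urlparse.parse_qs(urlparse.urlparse(link).query, True)['id']  # ['abc123:en']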
doumailspider.py
# -*- coding:utf-8 -*-
'''by sudo rm -rf  http://imchenkun.com'''
import scrapy
from faker import Factory
from douban.items import DoubanMailItem
import urlparse

f = Factory.create()


class MailSpider(scrapy.Spider):
    name = 'douban-mail'
    allowed_domains = ['accounts.douban.com', 'douban.com']
    start_urls = [
        'https://www.douban.com/'
    ]

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Host': 'accounts.douban.com',
        'User-Agent': f.user_agent()
    }

    formdata = {
        'form_email': '你的邮箱',     # your Douban account email
        'form_password': '你的密码',  # your Douban account password
        # 'captcha-solution': '',
        # 'captcha-id': '',
        'login': '登录',  # value of the submit button; the form expects this exact string
        'redir': 'https://www.douban.com/',
        'source': 'None'
    }

    def start_requests(self):
        return [scrapy.Request(url='https://www.douban.com/accounts/login',
                               headers=self.headers,
                               meta={'cookiejar': 1},
                               callback=self.parse_login)]

    def parse_login(self, response):
        # If a captcha is present, it has to be solved manually
        if 'captcha_image' in response.body:
            print 'Copy the link:'
            link = response.xpath('//img[@class="captcha_image"]/@src').extract()[0]
            print link
            captcha_solution = raw_input('captcha-solution:')
            captcha_id = urlparse.parse_qs(urlparse.urlparse(link).query, True)['id']
            self.formdata['captcha-solution'] = captcha_solution
            self.formdata['captcha-id'] = captcha_id
        return [scrapy.FormRequest.from_response(response,
                                                 formdata=self.formdata,
                                                 headers=self.headers,
                                                 meta={'cookiejar': response.meta['cookiejar']},
                                                 callback=self.after_login
                                                 )]

    def after_login(self, response):
        print response.status
        self.headers['Host'] = "www.douban.com"
        return scrapy.Request(url='https://www.douban.com/doumail/',
                              meta={'cookiejar': response.meta['cookiejar']},
                              headers=self.headers,
                              callback=self.parse_mail)

    def parse_mail(self, response):
        print response.status
        for item in response.xpath('//div[@class="doumail-list"]/ul/li'):
            mail = DoubanMailItem()
            mail['sender_time'] = item.xpath('div[2]/div/span[1]/text()').extract()[0]
            mail['sender_from'] = item.xpath('div[2]/div/span[2]/text()').extract()[0]
            mail['url'] = item.xpath('div[2]/p/a/@href').extract()[0]
            mail['title'] = item.xpath('div[2]/p/a/text()').extract()[0]
            print mail
            yield mail
__init__.py
(this file contains no code)
items.py
# -*- coding: utf-8 -*-
import scrapy


class DoubanBookItem(scrapy.Item):
    name = scrapy.Field()          # book title
    price = scrapy.Field()         # price
    edition_year = scrapy.Field()  # year of publication
    publisher = scrapy.Field()     # publisher
    ratings = scrapy.Field()       # rating
    author = scrapy.Field()        # author
    content = scrapy.Field()


class DoubanMailItem(scrapy.Item):
    sender_time = scrapy.Field()   # time sent
    sender_from = scrapy.Field()   # sender
    url = scrapy.Field()           # URL of the mail detail page
    title = scrapy.Field()         # mail subject


class DoubanMovieCommentItem(scrapy.Item):
    useful_num = scrapy.Field()    # number of "useful" votes
    no_help_num = scrapy.Field()   # number of "not helpful" votes
    people = scrapy.Field()        # reviewer
    people_url = scrapy.Field()    # reviewer's page
    star = scrapy.Field()          # rating
    comment = scrapy.Field()       # review text
    title = scrapy.Field()         # review title
    comment_page_url = scrapy.Field()  # URL of the current page
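Scrapy items behave like dictionaries with a fixed key set: fields are read and written with subscript syntax, but assigning a key that was not declared as a Field raises KeyError. A quick sketch using DoubanBookItem:

# Items support dict-style access, but only for declared fields.
from douban.items import DoubanBookItem

book = DoubanBookItem(name='Le Petit Prince')
book['ratings'] = '9.0'
print book['name'], book['ratings']
# book['isbn'] = 'x'  # would raise KeyError: DoubanBookItem does not support field: isbn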
pipelines.py
# -*- coding: utf-8 -*-


class DoubanBookPipeline(object):
    def process_item(self, item, spider):
        # e.g. '[法] 圣埃克苏佩里 / 马振聘 / 人民文学出版社 / 2003-8 / 22.00元'
        # i.e. author / translator / publisher / year / price
        info = item['content'].split(' / ')
        item['name'] = item['name']  # already set by the spider
        item['price'] = info[-1]
        item['edition_year'] = info[-2]
        item['publisher'] = info[-3]
        return item


class DoubanMailPipeline(object):
    def process_item(self, item, spider):
        # strip spaces and newlines from the subject
        item['title'] = item['title'].replace(' ', '').replace('\n', '')
        return item


class DoubanMovieCommentPipeline(object):
    def process_item(self, item, spider):
        return item
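To make the index arithmetic in DoubanBookPipeline concrete, here is the same split applied to the sample string from the comment. Indexing from the end of the list is deliberate: it still works when an optional field such as the translator is missing:

# -*- coding: utf-8 -*-
# Standalone sketch of the split used in DoubanBookPipeline.
info = '[法] 圣埃克苏佩里 / 马振聘 / 人民文学出版社 / 2003-8 / 22.00元'.split(' / ')
print info[-1]  # price:     22.00元
print info[-2]  # year:      2003-8
print info[-3]  # publisher: 人民文学出版社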
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for douban project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'douban'
SPIDER_MODULES = ['douban.spiders']
NEWSPIDER_MODULE = 'douban.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
from faker import Factory
f = Factory.create()
USER_AGENT = f.user_agent()
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Host': 'book.douban.com',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
}
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'douban.middlewares.MyCustomSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'douban.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    #'douban.pipelines.DoubanBookPipeline': 300,
    #'douban.pipelines.DoubanMailPipeline': 600,
    'douban.pipelines.DoubanMovieCommentPipeline': 900,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
scrapy.cfg
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = douban.settings
[deploy]
#url = http://localhost:6800/
project = douban
Example 2: douban_imgs
Directory tree
douban_imgs
├── douban_imgs
│   ├── spiders
│   │   ├── __init__.py
│   │   └── download_douban.py
│   ├── __init__.py
│   ├── items.py
│   ├── pipelines.py
│   ├── run_spider.py
│   └── settings.py
└── scrapy.cfg
spiders/__init__.py
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
download_douban.py
# coding=utf-8
from scrapy.spiders import Spider
from scrapy import Request
from douban_imgs.items import DoubanImgsItem


class download_douban(Spider):
    name = 'download_douban'

    default_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.douban.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }

    def __init__(self, url='1638835355', *args, **kwargs):
        self.allowed_domains = ['douban.com']
        self.start_urls = [
            'http://www.douban.com/photos/album/%s/' % (url)]
        self.url = url
        # call the parent class initializer
        # super(download_douban, self).__init__(*args, **kwargs)

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url=url, headers=self.default_headers, callback=self.parse)

    def parse(self, response):
        list_imgs = response.xpath('//div[@class="photolst clearfix"]//img/@src').extract()
        if list_imgs:
            item = DoubanImgsItem()
            item['image_urls'] = list_imgs
            yield item
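Because the album id is a parameter of __init__, a different album can be crawled without editing the code, via Scrapy's standard -a spider-argument flag. A hypothetical runner sketch:

# '-a url=...' is delivered to download_douban.__init__ as the url kwarg.
from scrapy import cmdline

cmdline.execute('scrapy crawl download_douban -a url=1638835355'.split())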
__init__.py
(this file contains no code)
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy import Field


class DoubanImgsItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    image_urls = Field()   # input for ImagesPipeline: list of image URLs to fetch
    images = Field()       # filled in by ImagesPipeline with the download results
    image_paths = Field()  # filled in by DoubanImgDownloadPipeline.item_completed() in pipelines.py
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy import Request


class DoubanImgsPipeline(object):
    def process_item(self, item, spider):
        return item


class DoubanImgDownloadPipeline(ImagesPipeline):
    default_headers = {
        'accept': 'image/webp,image/*,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, sdch, br',
        'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'cookie': 'bid=yQdC/AzTaCw',
        'referer': 'https://www.douban.com/photos/photo/2370443040/',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }

    def get_media_requests(self, item, info):
        # one download request per image URL, with custom headers
        for image_url in item['image_urls']:
            self.default_headers['referer'] = image_url
            yield Request(image_url, headers=self.default_headers)

    def item_completed(self, results, item, info):
        # collect the storage paths of the successful downloads
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
run_spider.py
from scrapy import cmdline
cmd_str = 'scrapy crawl download_douban'
cmdline.execute(cmd_str.split(' '))
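An alternative to shelling out through cmdline is Scrapy's in-process CrawlerProcess API. A sketch, assuming it is executed from the project root so that get_project_settings() can locate settings.py (and the image pipeline therefore stays enabled):

# Hypothetical in-process runner using scrapy.crawler.CrawlerProcess.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from douban_imgs.spiders.download_douban import download_douban

process = CrawlerProcess(get_project_settings())
process.crawl(download_douban)
process.start()  # blocks until the crawl finishes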
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for douban_imgs project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'douban_imgs'
SPIDER_MODULES = ['douban_imgs.spiders']
NEWSPIDER_MODULE = 'douban_imgs.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'douban_imgs (+http://www.yourdomain.com)'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS=32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY=3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN=16
# CONCURRENT_REQUESTS_PER_IP=16
# Disable cookies (enabled by default)
# COOKIES_ENABLED=False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED=False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# }
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'douban_imgs.middlewares.MyCustomSpiderMiddleware': 543,
# }
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'douban_imgs.middlewares.MyCustomDownloaderMiddleware': 543,
# }
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'douban_imgs.pipelines.DoubanImgDownloadPipeline': 300,
}
IMAGES_STORE = 'D:\\doubanimgs'
#IMAGES_STORE = '/tmp'
IMAGES_EXPIRES = 90
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
# AUTOTHROTTLE_ENABLED=True
# The initial download delay
# AUTOTHROTTLE_START_DELAY=5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY=60
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG=False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED=True
# HTTPCACHE_EXPIRATION_SECS=0
# HTTPCACHE_DIR='httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES=[]
# HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
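For reference, the stock ImagesPipeline stores each downloaded file under IMAGES_STORE in a full/ subdirectory, named after the SHA-1 hash of the source URL. A sketch of that naming rule (the URL below is made up):

# Default ImagesPipeline naming: <IMAGES_STORE>/full/<sha1-of-url>.jpg
import hashlib

url = 'https://img1.doubanio.com/view/photo/l/public/p2370443040.jpg'  # hypothetical
print 'full/%s.jpg' % hashlib.sha1(url).hexdigest()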
scrapy.cfg
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = douban_imgs.settings
[deploy]
#url = http://localhost:6800/
project = douban_imgs
Summary
That concludes this article's complete Scrapy crawler examples. I hope it proves helpful; interested readers can browse the other related topics on this site. If anything is missing or wrong, please leave a comment. Thanks for supporting the site!
Original article: http://blog.csdn.net/nnnnnnnnnnnny/article/details/54426779