You can clone the full source code from GitHub.
GitHub: https://github.com/williamzxl/scrapy_crawlmeizitu
Official Scrapy documentation (Chinese translation): http://scrapy-chs.readthedocs.io/zh_cn/latest/index.html
If you work through the documentation's tutorial once, you will basically know how to use the framework.
Step 1:
Before crawling anything, you must create a new Scrapy project. Enter the directory where you want to keep the code and run the following command:
scrapy startproject crawlmeizitu
This command creates a crawlmeizitu directory with the following contents:
crawlmeizitu/
    scrapy.cfg
    crawlmeizitu/
        __init__.py
        items.py
        pipelines.py
        settings.py
        middlewares.py
        spiders/
            __init__.py
            ...

Then enter the project directory and generate a spider for the target site:

cd crawlmeizitu
scrapy genspider meizitu http://www.meizitu.com/a/list_1_1.html
The genspider command creates the spider file meizitu.py under spiders/, so the project now looks like this:
crawlmeizitu/
    scrapy.cfg
    crawlmeizitu/
        __init__.py
        items.py
        pipelines.py
        settings.py
        middlewares.py
        spiders/
            meizitu.py
            __init__.py
            ...
The files we mainly edit are items.py, pipelines.py, settings.py and the spider meizitu.py under spiders/.
main.py was added by me afterwards; it contains just two lines,
from scrapy import cmdline
cmdline.execute("scrapy crawl meizitu".split())
and exists only to make the spider convenient to run.
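If you prefer not to go through cmdline, an equivalent runner can be written with Scrapy's CrawlerProcess API. This is only a minimal sketch of an alternative, not part of the original project:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# build a crawler process that loads this project's settings.py
process = CrawlerProcess(get_project_settings())
# "meizitu" is the spider name defined in meizitu.py
process.crawl("meizitu")
# start crawling and block until the crawl finishes
process.start()

Either version of main.py should be run from inside the project so that scrapy.cfg and settings.py can be found.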
Step 2: edit settings.py as shown below.
BOT_NAME = 'crawlmeizitu'

SPIDER_MODULES = ['crawlmeizitu.spiders']
NEWSPIDER_MODULE = 'crawlmeizitu.spiders'

ITEM_PIPELINES = {
    'crawlmeizitu.pipelines.CrawlmeizituPipeline': 300,
}

IMAGES_STORE = 'D://pic2'
DOWNLOAD_DELAY = 0.3

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
ROBOTSTXT_OBEY = True
The important settings here are USER_AGENT, the download path (IMAGES_STORE) and the download delay (DOWNLOAD_DELAY). Note that Scrapy only picks up settings whose names are written in uppercase.
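Instead of importing IMAGES_STORE from settings.py directly, as the pipeline in Step 4 does, a pipeline can also receive settings at runtime through from_crawler. A minimal sketch of that alternative, not used in the original code:

class CrawlmeizituPipeline(object):
    def __init__(self, images_store):
        self.images_store = images_store

    @classmethod
    def from_crawler(cls, crawler):
        # read the configured download directory from the project settings at startup
        return cls(images_store=crawler.settings.get('IMAGES_STORE'))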
Step 3: edit items.py.
items.py defines the fields that hold the information the spider scrapes. Since we are crawling an image gallery site, we want each gallery's title, page URL, tags, image links and image names:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class CrawlmeizituItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # title is used as the folder name
    title = scrapy.Field()
    url = scrapy.Field()
    tags = scrapy.Field()
    # links to the images
    src = scrapy.Field()
    # alt is the image name
    alt = scrapy.Field()
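Items behave like dictionaries, and every field here ends up holding the list returned by XPath's extract(). A quick illustrative snippet with made-up values:

from crawlmeizitu.items import CrawlmeizituItem

item = CrawlmeizituItem()
item['title'] = ['example gallery']      # extract() always returns a list
item['src'] = ['http://example.com/a.jpg', 'http://example.com/b.jpg']
print(item['title'], len(item['src']))   # -> ['example gallery'] 2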
Step 4: edit pipelines.py.
The pipeline processes the items produced by the spider: it creates a folder based on the title, builds the file names, and downloads every image from its link.
# -*- coding: utf-8 -*-
import os
import requests
from crawlmeizitu.settings import IMAGES_STORE


class CrawlmeizituPipeline(object):

    def process_item(self, item, spider):
        # folder name built from the gallery title
        fold_name = "".join(item['title'])
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
            # the site checks this cookie when serving images; without it the downloaded pictures cannot be viewed
            'Cookie': 'b963ef2d97e050aaf90fd5fab8e78633',
        }
        images = []
        # all images are stored under one directory
        dir_path = '{}'.format(IMAGES_STORE)
        if not os.path.exists(dir_path) and len(item['src']) != 0:
            os.mkdir(dir_path)
        if len(item['src']) == 0:
            # log galleries that yielded no image links so they can be checked later
            with open('..//check.txt', 'a+') as fp:
                fp.write("".join(item['title']) + ":" + "".join(item['url']))
                fp.write("\n")
        for jpg_url, name, num in zip(item['src'], item['alt'], range(0, 100)):
            file_name = name + str(num)
            file_path = '{}//{}'.format(dir_path, file_name)
            images.append(file_path)
            if os.path.exists(file_path) or os.path.exists(file_name):
                continue
            with open('{}//{}.jpg'.format(dir_path, file_name), 'wb') as f:
                req = requests.get(jpg_url, headers=header)
                f.write(req.content)
        return item
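As an aside, Scrapy also ships a built-in ImagesPipeline that handles downloading, de-duplication and storage under IMAGES_STORE for you (it requires Pillow). A minimal sketch of how this project could use it instead of requests; this is an assumed alternative, not the original code:

import scrapy
from scrapy.pipelines.images import ImagesPipeline


class MeizituImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # ask Scrapy's downloader to fetch every image URL the spider collected
        for image_url in item['src']:
            yield scrapy.Request(image_url)

To enable it, ITEM_PIPELINES in settings.py would point at this class instead of CrawlmeizituPipeline; by default the files end up under IMAGES_STORE/full/ named with a SHA1 hash of the URL.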
Step 5: edit the spider itself, meizitu.py.
This is the most important part:
# -*- coding: utf-8 -*-
import scrapy
from crawlmeizitu.items import CrawlmeizituItem
#from crawlmeizitu.items import CrawlmeizituItemPage
import time


class MeizituSpider(scrapy.Spider):
    name = "meizitu"
    #allowed_domains = ["meizitu.com/"]
    # the start URL is read from ..//url.txt; the last line of the file is used
    start_urls = []
    last_url = []
    with open('..//url.txt', 'r') as fp:
        crawl_urls = fp.readlines()
        for start_url in crawl_urls:
            last_url.append(start_url.strip('\n'))
    start_urls.append("".join(last_url[-1]))

    def parse(self, response):
        selector = scrapy.Selector(response)
        #item = CrawlmeizituItemPage()
        next_pages = selector.xpath('//*[@id="wp_page_numbers"]/ul/li/a/@href').extract()
        next_pages_text = selector.xpath('//*[@id="wp_page_numbers"]/ul/li/a/text()').extract()
        all_urls = []
        if '下一页' in next_pages_text:  # '下一页' means "next page"
            next_url = "http://www.meizitu.com/a/{}".format(next_pages[-2])
            # remember the next list page in url.txt so the crawl can resume from it
            with open('..//url.txt', 'a+') as fp:
                fp.write('\n')
                fp.write(next_url)
                fp.write("\n")
            request = scrapy.http.Request(next_url, callback=self.parse)
            time.sleep(2)
            yield request
        all_info = selector.xpath('//h3[@class="tit"]/a')
        # follow the link of every gallery on the list page
        for info in all_info:
            # the leading // makes this an absolute XPath, so it returns every link each time;
            # Scrapy's duplicate filter drops the repeated requests
            links = info.xpath('//h3[@class="tit"]/a/@href').extract()
            for link in links:
                request = scrapy.http.Request(link, callback=self.parse_item)
                time.sleep(1)
                yield request
        # next_link = selector.xpath('//*[@id="wp_page_numbers"]/ul/li/a/@href').extract()
        # next_link_text = selector.xpath('//*[@id="wp_page_numbers"]/ul/li/a/text()').extract()
        # if '下一页' in next_link_text:
        #     nextpage = "http://www.meizitu.com/a/{}".format(next_link[-2])
        #     item['page_url'] = nextpage
        #     yield item

    # collect the information of each gallery page
    def parse_item(self, response):
        item = CrawlmeizituItem()
        selector = scrapy.Selector(response)
        image_title = selector.xpath('//h2/a/text()').extract()
        image_url = selector.xpath('//h2/a/@href').extract()
        image_tags = selector.xpath('//div[@class="metaright"]/p/text()').extract()
        if selector.xpath('//*[@id="picture"]/p/img/@src').extract():
            image_src = selector.xpath('//*[@id="picture"]/p/img/@src').extract()
        else:
            image_src = selector.xpath('//*[@id="maincontent"]/div/p/img/@src').extract()
        if selector.xpath('//*[@id="picture"]/p/img/@alt').extract():
            pic_name = selector.xpath('//*[@id="picture"]/p/img/@alt').extract()
        else:
            pic_name = selector.xpath('//*[@id="maincontent"]/div/p/img/@alt').extract()
        #//*[@id="maincontent"]/div/p/img/@alt
        item['title'] = image_title
        item['url'] = image_url
        item['tags'] = image_tags
        item['src'] = image_src
        item['alt'] = pic_name
        print(item)
        time.sleep(1)
        yield item
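Note that the spider reads its start URL from ..//url.txt (the last line of the file is used), so that file must exist before the first run. A minimal way to seed it, assuming the list page used with genspider in Step 1 and the same working directory the spider is later run from:

# run this once from the directory the spider will be started from,
# so that '..//url.txt' resolves to the same file the spider opens
with open('..//url.txt', 'w') as fp:
    fp.write('http://www.meizitu.com/a/list_1_1.html\n')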
Summary
The above is the implementation of crawling all of a site's images with Python's Scrapy framework and saving them locally. I hope it is useful; if you have any questions, leave a comment and I will reply as soon as I can.
Original article: https://www.cnblogs.com/william126/p/6923017.html