python爬取图片并保存到本地

时间:2022-11-12 10:36:11

Python爬取图片(你懂的)

requests与Bs4

这两个模块是本文使用的主要模块:requests用于发送HTTP请求并获取网页内容;bs4全名BeautifulSoup,是编写Python爬虫的常用库之一,主要用来解析HTML标签。这两个模块可以在cmd终端中通过pip安装

pip install bs4
pip install requests
pip install lxml

代码实现

import requests
from bs4 import BeautifulSoup
import os
class Mzitu():
    """Crawler that walks the listing pages of www.mzitu.com and saves images.

    For every image set linked from a listing page, a folder named after the
    last path segment of the set's URL is created under ``base_path`` and the
    set's images are written into it as ``1.jpg``, ``2.jpg``, ...
    """

    def __init__(self):
        # The listing site only needs a browser-like User-Agent header.
        self.headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
        }
        # Image folders are created under the current working directory.
        self.base_path = os.getcwd()

    def get_url(self, html):
        """Yield the ``href`` of every link inside the ``<ul id="pins">`` list.

        Args:
            html: HTML text of one listing page.

        Yields:
            The ``href`` attribute of each ``<a>`` found in the list.

        Raises:
            IndexError: if the page has no ``<ul id="pins">`` element.
        """
        html_b = BeautifulSoup(html, 'lxml')
        urls_b = html_b.find_all('ul', attrs={'id': 'pins'})[0]
        for anchor in urls_b.find_all('a'):
            yield anchor['href']

    def get_img_url_max(self, url):
        """Return the image count of an image set, as a string.

        The value is the text of the ``<span>`` inside the second-to-last
        link of the page's ``pagenavi`` div (presumably the last page number;
        the last link is assumed to be a "next" link).

        Args:
            url: URL of the image set's first page.
        """
        html_i = requests.get(url, headers=self.headers).text
        html_b = BeautifulSoup(html_i, 'lxml')
        pagenavi = html_b.find_all('div', attrs={'class': 'pagenavi'})[0]
        return pagenavi.find_all('a')[-2].span.text

    def get_img_url(self, url):
        """Return the ``src`` of the image shown in the ``main-image`` div.

        Args:
            url: URL of an image-set page.
        """
        html_i = requests.get(url, headers=self.headers).text
        html_b = BeautifulSoup(html_i, 'lxml')
        img_url = html_b.find_all('div', attrs={'class': 'main-image'})[0].p.a.img['src']
        return img_url

    def download_img(self, name, url):
        """Fetch one image and return its raw bytes.

        Args:
            name: Image-set name; used to build the ``Referer`` header.
            url: Direct URL of the image file.

        Returns:
            The response body as ``bytes``.
        """
        # The image host gets a fuller header set than the listing site,
        # including a Referer that points at the image set's page.
        headers = {
            'Accept':'image/webp,image/apng,image/*,*/*;q=0.8',
            'Accept-Encoding':'gzip, deflate',
            'Accept-Language':'zh-CN,zh;q=0.9',
            'Connection':'keep-alive',
            'Host': 'i.meizitu.net',
            'Referer': 'http://www.mzitu.com/%s'%name,
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
        }
        img = requests.get(url, headers=headers).content
        return img

    def get_img(self, name, max, img_url):
        """Download images ``1..max`` of a set into ``<base_path>/<name>/``.

        Args:
            name: Image-set name; used as the folder name and passed on to
                ``download_img``.
            max: Number of images in the set (anything ``int()`` accepts).
                The name shadows the builtin but is kept for compatibility.
            img_url: URL of the first image. Assumed to end in a two-digit
                index followed by a four-character extension, e.g.
                ``...01.jpg`` - TODO confirm against the live site.
        """
        path = os.path.join(self.base_path, name)
        os.makedirs(path, exist_ok=True)

        extension = img_url[-4:]
        # Upper bound is inclusive so the last image is downloaded too.
        for i in range(1, int(max) + 1):
            k = str(i)
            # Always derive the URL from the first image's URL, so the slice
            # offsets stay valid however many digits earlier indices had.
            if len(k) < 2:
                # One-digit index: keep the leading digit, replace the last.
                current_url = img_url[:-5] + k + extension
            else:
                # Longer index: replace the whole two-digit index.
                current_url = img_url[:-6] + k + extension
            img = self.download_img(name, current_url)
            with open(os.path.join(path, k + '.jpg'), 'wb') as f:
                f.write(img)

    def get_html_url_link_max(self):
        """Return the number of listing pages on the main site, as a string.

        Reads the ``href`` of the second-to-last ``<a class="page-numbers">``
        on the home page and returns the path segment at index 4 after
        splitting on ``/`` (the ``N`` in ``http://www.mzitu.com/page/N/``).
        """
        url = 'http://www.mzitu.com/'
        html = requests.get(url, headers=self.headers).text
        html_b = BeautifulSoup(html, 'lxml')
        last_page_href = html_b.find_all('a', attrs={'class': 'page-numbers'})[-2]['href']
        return last_page_href.split('/')[4]

    def main(self):
        """Crawl every listing page and download every image set it links to."""
        page_count = int(self.get_html_url_link_max())
        for page in range(1, page_count + 1):
            # Build the URL of each listing page in turn.
            url = 'http://www.mzitu.com/page/%d/' % page
            html = requests.get(url, headers=self.headers).text
            for set_url in self.get_url(html):
                name = set_url.split('/')[-1]
                image_count = self.get_img_url_max(set_url)
                img_url = self.get_img_url(set_url)
                self.get_img(name, image_count, img_url)

if __name__ == '__main__':
    # Script entry point: build the crawler and start the full crawl.
    crawler = Mzitu()
    crawler.main()

运行程序后,即可看到当前工作目录(运行程序时所在的文件夹)下不断生成包含图片的文件夹

封装后的exe下载