Python爬取 | 唯美女生图片

以下仅为代码展示，复制后不能直接运行，需要先完成相关配置，具体请查看下方链接的介绍：

Python爬取 | 唯美女生图片

from selenium import webdriver

from fake_useragent import UserAgent

from pyquery import PyQuery as pq

import winreg

from time import sleep

import time

import requests

import re

import os

# HTTP request headers passed to the requests.get calls in this file.
# UserAgent().random is evaluated once, when this statement runs, so the same
# User-Agent value is reused for all requests (presumably a randomly chosen
# browser UA string -- confirm against the fake_useragent documentation).
header = {

    'User-Agent': UserAgent().random

}

def html_id(id_url):
    """Fetch one post page and extract its category, title and image links.

    Args:
        id_url: Full URL of the post page; the text between the last ``/``
            and the first following ``.`` is treated as the post ID in the
            failure message.

    Returns:
        ``[category, title, links]`` on success, where ``links`` is a list of
        image URLs built by prefixing ``https:`` to each gallery attribute
        value (presumably the site emits protocol-relative URLs -- confirm).
        ``0`` when the request fails or the category element is missing from
        the page, which is treated as "page source not obtained"; the caller
        retries such IDs in a later cycle.
    """
    failed_id = id_url.split('/')[-1].split('.')[0]  # only used for the failure message
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        r = requests.get(id_url, headers=header, timeout=30)
    except requests.RequestException:
        # Network errors take the same "retry in the next cycle" path as a bad page.
        print(f'{failed_id} 获取失败，等待下一次循环')
        return 0
    time.sleep(0.3)  # short pause between page requests
    doc = pq(r.text)
    classical = doc('.d-md-inline-block').eq(0).children('a').text()  # post category
    if not classical:
        # No category found: the page source was not obtained correctly.
        print(f'{failed_id} 获取失败，等待下一次循环')
        return 0
    name = doc('.post-title').text()  # post title
    links = []
    # Newer posts keep the image URL in <a href>; posts from a few years ago
    # keep it in <img src>, so fall back to that when the first pass is empty.
    for selector, attr in (('.nc-light-gallery a', 'href'),
                           ('.nc-light-gallery img', 'src')):
        values = (tag.attr(attr) for tag in doc(selector).items())
        # Tags without the attribute yield None; skip them instead of crashing.
        links = ['https:' + value for value in values if value and '.' in value]
        if links:
            break
    return [classical, name, links]

def download(id, con, path, path3):
    """Download every image of one post and record the post as downloaded.

    Args:
        id: Post ID as a string; it is written to the record file.
        con: ``[category, title, links]`` as returned by ``html_id``.
        path: Root download folder; images go into ``path\\<category>``.
        path3: Path of the record file that receives one
            ``category,title,id`` line per processed post.

    Individual image failures are printed and skipped. The post is recorded
    in ``path3`` even when some images failed, as in the original behaviour.
    """
    classical, name, links = con[0], con[1], con[2]
    print(f'{id} {classical} {name} 下载中...', end=' ')
    img_path = path + '\\' + classical  # one folder per category
    # makedirs also creates a missing parent and tolerates an existing folder.
    os.makedirs(img_path, exist_ok=True)
    print(f'共{len(links)}张 ——> ', end='')
    for num, j in enumerate(links, start=1):  # num is the 1-based image counter
        names = img_path + '\\' + name + str(num) + os.path.splitext(j)[1]  # target file name
        if 't.cdn.ink' not in j:
            # Some later links lack the CDN host; insert it after 'https:'.
            j = j[:6] + '//t.cdn.ink/' + j[6:]
        try:
            # Fetch first and open the file only on success, so a failed
            # request does not leave an empty image file behind.
            resp = requests.get(j, headers=header, timeout=30)
            resp.raise_for_status()  # do not save HTTP error pages as images
            with open(names, 'wb') as f:
                f.write(resp.content)
            print(f'{num} ', end='')
        except Exception as e:  # best effort: report and continue with the next image
            print(f'\n第{num}张下载错误，错误来自：{e} ')
    # Append the processed ID to the record file.
    with open(path3, 'a+', encoding='utf-8') as f:
        f.write(classical + ',' + name + ',' + id + '\n')
    print('下载完成！！！')

def txt_id(path):
    """Read post IDs from one of the ID text files and return them as strings.

    Args:
        path: Path of the file to read. If the file name contains ``haven``
            it is parsed as the record file (``category,title,id`` per line);
            otherwise it is parsed as a plain list with one integer ID per
            line.

    Returns:
        For the record file: the last comma-separated field of every
        non-blank line, in file order, or ``[]`` if the file does not exist.
        For the other files: the IDs sorted numerically in descending order.

    Raises:
        FileNotFoundError: if a non-record file does not exist.
        ValueError: if a non-record file contains a non-numeric line.
    """
    # Test only the file name, so a directory that happens to contain 'haven'
    # does not switch the parsing mode.
    file_name = os.path.basename(path.replace('\\', '/'))
    if 'haven' in file_name:
        if not os.path.exists(path):
            return []
        # The record file is written as UTF-8; errors='replace' keeps the
        # reader from failing on stray bytes (only the ASCII ID is used).
        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            fields = [line.split(',')[-1].strip() for line in f]
        return [field for field in fields if field]  # drop blank lines / empty IDs

    with open(path, 'r') as f:
        # Skip blank lines (e.g. a trailing newline) instead of failing on int('').
        numbers = [int(line) for line in f if line.strip()]
    numbers.sort(reverse=True)  # numeric, largest ID first
    return [str(number) for number in numbers]

def get_id(html, path):
    """Save the listing page source and extract every post ID found in it.

    Args:
        html: Page source of the fully loaded listing page.
        path: Working folder. The source is written to
            ``path\\html源代码\\vm_girls.html`` and the IDs to
            ``path\\ID_all.txt`` and ``path\\ID_not.txt`` (both overwritten).

    IDs are written one per line in ascending numeric order, without
    duplicates.
    """
    # Save the HTML source.
    path_html = path + r'\html源代码'
    os.makedirs(path_html, exist_ok=True)
    with open(path_html + r'\vm_girls.html', 'w', encoding='utf-8') as f:
        f.write(html)

    # Each post ID is the first run of digits in the last path segment of an
    # anchor's href.
    doc = pq(html)
    found = set()  # a set, because the same post can be linked more than once
    for anchor in doc('.media-3x2 a'):
        url = pq(anchor).attr('href')
        if not url:
            continue  # anchor without href
        match = re.search(r'\d+', url.split('/')[-1])
        if match:  # skip links whose last segment has no digits
            found.add(int(match.group()))
    ids = [str(number) for number in sorted(found)]

    # Both files start with the full list; ID_not.txt is rewritten later by
    # the download loop.
    for file_name in (r'\ID_all.txt', r'\ID_not.txt'):
        with open(path + file_name, 'w') as f:
            f.write('\n'.join(ids))

def get_html(url, chromedriver_path):
    """Open the page in Chrome, keep loading more posts, and return the source.

    The function repeatedly clicks the element with class
    ``dposts-ajax-load`` until its text is ``没有更多内容`` or ``加载终止``.
    A key press inside the browser page sets the text to ``加载终止``, which
    lets the user stop loading early.

    Args:
        url: Address of the listing page.
        chromedriver_path: Path of the chromedriver executable.

    Returns:
        The page source at the moment loading stopped.
    """
    wb = webdriver.Chrome(executable_path=chromedriver_path)
    try:
        wb.implicitly_wait(5)
        wb.get(url)
        start_time = time.time()
        # Pressing any key in the page rewrites the button text, which the
        # loop below treats as a stop signal.
        wb.execute_script('''

        document.body.addEventListener("keypress", function(){ document.getElementsByClassName('dposts-ajax-load')[0].innerText='加载终止'; });

        ''')
        while True:
            try:
                button = wb.find_element_by_class_name('dposts-ajax-load')
                end = button.text
                if end in ['没有更多内容', '加载终止']:
                    print(end)
                    break
                button.click()
            except Exception:
                # Element missing or not clickable yet: wait and retry.
                # Catching Exception rather than a bare except lets Ctrl-C
                # (KeyboardInterrupt) abort the loop.
                sleep(1)
            finally:
                # Scroll near the bottom so the load button stays reachable.
                # The 1532 offset may need adjusting for other window sizes.
                wb.execute_script("window.scrollTo(0, document.body.scrollHeight-1532)")
        html = wb.page_source
        print(wb.title)
    finally:
        wb.quit()  # always close the browser, including on errors
    end_time = time.time()
    times = end_time - start_time
    print(f'加载内容总耗时{times // 60:.0f}分{times % 60:.2f}秒！')
    return html

def get_desktop():
    """Return the current user's Desktop folder path as a string.

    The value is read from the ``Desktop`` entry of the
    ``Explorer\\Shell Folders`` key under ``HKEY_CURRENT_USER``, so this only
    works on Windows.

    Raises:
        OSError: if the registry key or value cannot be read.
    """
    # The with-block closes the registry handle; the original left it open.
    with winreg.OpenKey(
            winreg.HKEY_CURRENT_USER,
            r'Software\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders') as key:
        zm = winreg.QueryValueEx(key, "Desktop")[0]
    return str(zm)

def main():
    """Run the crawl: collect post IDs, then download every post not yet recorded.

    Working files live in ``<Desktop>\\vmGirls``:
    ``ID_all.txt`` (all IDs), ``ID_not.txt`` (IDs still to download) and
    ``ID_haven.txt`` (records of downloaded posts). The download loop repeats
    until at most 5 IDs remain, so posts whose page could not be fetched are
    retried in later cycles.
    """
    url = 'https://www.vmgirls.com/'  # site root; post pages are url + '<id>.html'
    path = get_desktop() + r'\vmGirls'
    os.makedirs(path, exist_ok=True)
    chromedriver_path = get_desktop() + r'\chromedriver.exe'  # browser driver location

    judge = True
    if os.path.exists(path + r'\html源代码\vm_girls.html'):
        # Any answer other than '否' reloads the listing page.
        judge = input('html源代码已存在，是否需要重新加载：') != '否'
    if judge:
        html = get_html(url, chromedriver_path)  # load the full listing page
        get_id(html, path)  # save the source and write the ID files

    path1 = path + '\\ID_all.txt'  # every parsed ID
    path2 = path + '\\ID_not.txt'  # IDs not downloaded yet
    path3 = path + '\\ID_haven.txt'  # records of downloaded IDs

    id_not = txt_id(path2)
    id_haven = set(txt_id(path3))
    cycle = 0  # number of completed passes
    start_time = time.time()
    while len(id_not) > 5:
        cycle += 1
        id_all = txt_id(path1)
        # Filter with a set instead of list.remove, which raised ValueError
        # for a recorded ID that is absent from ID_all.txt.
        pending = [i for i in id_all if i not in id_haven]
        for i in pending:
            con = html_id(url + i + '.html')
            if con:  # page fetched and parsed successfully
                download(i, con, path, path3)
        # Refresh both lists from disk. The original never updated them, so
        # the loop could not terminate and re-downloaded finished posts.
        id_haven = set(txt_id(path3))
        id_not = [i for i in id_all if i not in id_haven]
        print(f'第{cycle}次循环，还剩下{len(id_not)}个ID未下载！')
        with open(path2, 'w') as f:  # persist the IDs still missing
            f.write('\n'.join(id_not))
        time.sleep(2)
    print('结束')
    end_time = time.time()
    times = end_time - start_time
    print(f'下载总耗时{times // 60:.0f}分{times % 60:.2f}秒！')

# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':

    main()

Python爬取 | 唯美女生图片的更多相关文章

python爬取某个网页的图片-如百度贴吧
python爬取某个网页的图片-如百度贴吧作者:vpoet mail:vpoet_sir@163.com 注:随意copy,不用告诉我 #coding:utf-8 import urllib imp ...
python爬取某个网站的图片并保存到本地
python爬取某个网站的图片并保存到本地 #coding:utf- import urllib import re import sys reload(sys) sys.setdefaultenco ...
Python 爬取陈都灵百度图片
Python 爬取陈都灵百度图片标签(空格分隔): 随笔今天意外发现了自己以前写的一篇爬虫脚本,爬取的是我的女神陈都灵,尝试运行了一下发现居然还能用.故把脚本贴出来分享一下. import req ...
python 爬取天猫美的评论数据
笔者最近迷上了数据挖掘和机器学习,要做数据分析首先得有数据才行.对于我等平民来说,最廉价的获取数据的方法,应该是用爬虫在网络上爬取数据了.本文记录一下笔者爬取天猫某商品的全过程,淘宝上面的店铺也是类似 ...
python爬取网页文本、图片
从网页爬取文本信息: eg:从http://computer.swu.edu.cn/s/computer/kxyj2xsky/中爬取讲座信息(讲座时间和讲座名称) 注:如果要爬取的内容是多页的话,网址 ...
python: 爬取[博海拾贝]图片脚本
练手代码,聊作备忘: # encoding: utf-8 # from __future__ import unicode_literals import urllib import urllib2 ...
Python爬取mn52网站美女图片以及图片防盗链的解决方法
防盗链原理 http标准协议中有专门的字段记录referer 一来可以追溯上一个入站地址是什么二来对于资源文件,可以跟踪到包含显示他的网页地址是什么因此所有防盗链方法都是基于这个Referer字段 ...
python爬取并批量下载图片
import requests from lxml import etree url='http://desk.zol.com.cn/meinv/' add1='.html' urls=[] i = ...
Python: 爬取百度贴吧图片
练习之代码片段,以做备忘: # encoding=utf8 from __future__ import unicode_literals import urllib, urllib2 import ...

随机推荐

ajax——CORS跨域调用REST API 的常见问题以及前后端的设置
RESTful架构是目前比较流行的一种互联网软件架构,在此架构之下的浏览器前端和手机端能共用后端接口. 但是涉及到js跨域调用接口总是很头疼,下边就跟着chrome的报错信息一起来解决一下. 假设:前 ...
zabbix报警媒介------>微信报警
zabbix报警媒介------>微信报警作者:尹正杰版权声明:原创作品,谢绝转载!否则将追究法律责任. 欢迎加入高级运维工程师之路:598432640 微信在我们的生活中使用的比较频繁,有 ...
MultiDex到底有多坑
google为什么要引入MultiDex? dex指令是用16位寄存器来保存dex中的方法数,所以限制了在apk 中最大的方法数为65535,当超过这个最大值在编译的时候会报方法数超标的错误. 如何 ...
DataGridView合并单元格
昨天一个同事问我DataGridView单元格合并的问题,一开始按照我的设想是算出两个单元格的Rectangle,然后直接使用e.Graphics.FillRectangle(backColorBru ...
Matlab boxplot for Multiple Groups（多组数据的箱线图）
在画之前首先介绍一下Matlab boxplot,下面这段说明内容来自http://www.plob.org/2012/06/10/2153.html 由于matlab具有强大的计算功能,用其统计 ...
ASP.NET Cache
ASP.NET为了方便我们访问Cache,在HttpRuntime类中加了一个静态属性Cache,这样,我们就可以在任意地方使用Cache的功能. 而且,ASP.NET还给它增加了二个“快捷方式”:P ...
使用CLRMD时通过Symbol Server找Dac的位置来初始化ClrRuntime
博客搬到了fresky.github.io - Dawei XU,请各位看官挪步.最新的一篇是:使用CLRMD时通过Symbol Server找Dac的位置来初始化ClrRuntime.
Ubuntu 启动器/快捷方式/ 制作（Eclipse为例）
首先,在路径/usr/share/applications/,中创建eclipse.desktop(如果没有的话) sudo touch /usr/share/applications/eclipse ...
linux arp攻击解决方法测试很有效
公司有台centos服务器中了arp攻击,严重影响业务,测试了很多方法都没解决,机房技术也没法处理. 通过下面方法,可以有效抵挡arp攻击. 1.环境 centos6.4 2.执行 arpin ...
Android线程和handler
根据视频仿照着写了个demo: package com.wyl.wylthreadtest; import android.graphics.Color; import android.os.Bund ...