python 百度图片爬虫

# -*- coding:utf-8 -*-

#https://blog.****.net/qq_32166627/article/details/60882964

import requests

import os

import pinyin

import simplejson

def getManyPages(keyword,pages):
    """Query the Baidu image-search JSON endpoint and collect result pages.

    One GET request is sent per page to
    ``https://image.baidu.com/search/acjson``.  Every request carries
    ``rn=30`` and a ``pn`` value that steps through 30, 60, ..., 30 * pages.

    Args:
        keyword: Search term, sent as both ``queryWord`` and ``word``.
        pages: Number of requests to send; 0 or less sends none.

    Returns:
        A list with the ``data`` value of each response that decoded as
        JSON and contained a ``data`` entry.  Responses that fail to decode
        or have no ``data`` are skipped, so the list may be shorter than
        ``pages``.
    """
    url = 'https://image.baidu.com/search/acjson'
    urls = []

    # NOTE(review): ``pn`` starts at 30 rather than 0, so if it is a result
    # offset the first 30 hits are never requested -- confirm this is wanted.
    for offset in range(30, 30 * pages + 30, 30):
        params = {
            'tn': 'resultjson_com',
            'ipn': 'rj',
            'ct': 201326592,
            'is': '',
            'fp': 'result',
            'queryWord': keyword,
            'cl': 2,
            'lm': -1,
            'ie': 'utf-8',
            'oe': 'utf-8',
            'adpicid': '',
            'st': -1,
            'z': '',
            'ic': 0,
            'word': keyword,
            's': '',
            'se': '',
            'tab': '',
            'width': '',
            'height': '',
            'face': 0,
            'istype': 2,
            'qc': '',
            'nc': 1,
            'fr': '',
            'pn': offset,
            'rn': 30,
            'gsm': '1e',
            '': '',
        }
        try:
            rgjson = requests.get(url, params=params).json().get('data')
        except ValueError:
            # Both json.JSONDecodeError and simplejson's JSONDecodeError
            # derive from ValueError, so this works whichever JSON backend
            # requests happens to use.
            print('【错误】simplejson.scanner.JSONDecodeError ')
            continue
        if rgjson is None:
            # No 'data' entry in the response: nothing to download, and
            # callers iterate every page, so do not hand them a None.
            continue
        urls.append(rgjson)

    return urls

def getImg(dataList, localPath, keyword):
    """Download the thumbnail of every search result into ``localPath``.

    Args:
        dataList: Iterable of result pages.  Each page is an iterable of
            dicts that may carry a ``'thumbURL'`` entry; a page that is
            ``None`` is skipped.
        localPath: Directory to save into; created (including missing
            parent directories) when it does not exist.
        keyword: File-name prefix.  Images are saved as
            ``<keyword>_<n>.jpg`` with ``n`` counting from 0 across all
            pages.

    Returns:
        None.
    """
    if not os.path.exists(localPath):
        os.makedirs(localPath)

    x = 0
    for page in dataList:
        if page is None:
            # A response without usable data; nothing to iterate.
            continue
        for item in page:
            thumb_url = item.get('thumbURL')
            if thumb_url is None:
                print('image not exist')
                continue
            print("down " + str(x) + " image " + thumb_url)
            ir = requests.get(thumb_url)
            target = os.path.join(localPath, keyword + '_%d.jpg' % x)
            # The context manager closes the handle even if the write fails.
            with open(target, 'wb') as image_file:
                image_file.write(ir.content)
            x += 1

def convert():
    """Write the unique second column of stars_list.txt to stars_list_clean.txt.

    Each line of ``stars_list.txt`` has ``\\r``, ``\\n`` and ``\\t`` removed,
    is stripped, and is split on commas.  The second field is printed and,
    the first time it is seen, written on its own line to
    ``stars_list_clean.txt``.  Repeated values are reported and not written
    again.  Lines without a second field (blank lines, lines with no comma)
    are skipped.

    Both files are resolved relative to the current working directory.
    """
    seen = set()  # set membership keeps de-duplication O(1) per line

    # The input is opened first so a missing stars_list.txt does not
    # truncate an existing stars_list_clean.txt.
    with open("stars_list.txt", 'r') as face_file, \
            open("stars_list_clean.txt", 'w') as fp:
        for line in face_file:
            line = line.replace('\r', '').replace('\n', '').replace('\t', '')
            line_split = line.strip().split(",")
            if len(line_split) < 2:
                continue

            name = line_split[1]
            print(name)
            if name in seen:
                print(name, " is exist")
                continue

            seen.add(name)
            fp.write('%s\n' % name)

def debug():
    """Run the crawler once for a single hard-coded keyword.

    Asks ``getManyPages`` for two pages of results for the keyword and
    passes them to ``getImg`` with ``./hanxue`` as the target directory and
    ``hym`` as the file-name prefix.
    """
    search_term = '胡因梦'
    file_prefix = "hym"

    # Second argument: how many result pages to request.
    result_pages = getManyPages(search_term, 2)

    # Second argument: directory the images are saved to.
    getImg(result_pages, './hanxue', file_prefix)

def run():
    """Download images for every name listed in stars_list_clean.txt.

    Steps:
      1. Read ``stars_list_clean.txt`` and remove ``\\r``, ``\\n`` and
         ``\\t`` from each line to get the search keywords.
      2. Write ``pinyin.get(keyword, format="strip")`` for each keyword,
         one per line, to ``stars_list_en.txt``.
      3. For keyword number ``i`` (counting from 0), create
         ``./stars_srcimg/<i>_<keyword>`` and download 5 pages of results
         into it via ``getManyPages`` and ``getImg``.  A keyword whose
         directory already exists is skipped.

    All paths are resolved relative to the current working directory.
    """
    with open("stars_list_clean.txt", 'r') as face_file:
        keywords = [
            line.replace('\r', '').replace('\n', '').replace('\t', '')
            for line in face_file
        ]

    # Convert each keyword once and reuse the result for both the file and
    # the progress output below.
    romanized = [pinyin.get(keyword, format="strip") for keyword in keywords]

    # The context manager guarantees the list is flushed and closed.
    with open("stars_list_en.txt", 'w') as fp:
        for keyword_english in romanized:
            fp.write('%s\n' % keyword_english)

    root_dir = "./stars_srcimg/"
    if not os.path.exists(root_dir):
        os.mkdir(root_dir)

    pages = 5
    maxnum = pages * 30
    print(maxnum)

    # The index advances for skipped keywords too, so a keyword keeps the
    # same directory number from one run to the next.
    for face_ID_index, keyword in enumerate(keywords):
        print(keyword)
        print(romanized[face_ID_index])

        face_ID = str(face_ID_index) + "_" + keyword
        facesavepath = root_dir + face_ID
        print(facesavepath)

        if os.path.exists(facesavepath):
            print(keyword, " exist")
            continue
        os.mkdir(facesavepath)

        print("down "  + keyword)
        # Second argument: number of result pages to request.
        dataList = getManyPages(keyword, pages)
        # Second argument: directory the images are saved to.
        getImg(dataList, facesavepath, face_ID)

# Script entry point: when executed directly, run the single-keyword debug
# download.
if __name__ == '__main__':

  debug()

  # Call run() instead to process every name in stars_list_clean.txt.
  #run()
秒客网

python 百度图片爬虫

相关文章