Python配合BeautifulSoup读取网络图片并保存在本地

本例为Python配合BeautifulSoup读取网络图片，并保存在本地。

BeautifulSoup可以代替正则表达式，更方便地解析HTML文本，获取其中的指定内容，如标签（Tag）、属性（Attribute）等。

# -*- coding: gbk -*-

import urllib

import urllib2

from bs4 import BeautifulSoup

import time

import re

import os,sys

import chardet

def req(url):
    """Fetch *url* with a browser-like User-Agent and return the response body.

    The body is also echoed to stdout.

    Args:
        url: Address of the page to download.

    Returns:
        The response body exactly as read from the server.

    Raises:
        urllib2.URLError: If the request cannot be completed.
    """
    header = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    request = urllib2.Request(url, headers=header)
    # A Request object (and therefore the custom header) must be opened with
    # urllib2.urlopen(); urllib.urlopen() only accepts a plain URL string.
    data = urllib2.urlopen(request).read()
    print(data)
    return data

def reqImg():
    """Download one picture from each of the 19 pages of junmeng.com gallery 22376.

    Page 1 is ``22376.html``; pages 2-19 are ``22376_<n>.html``. Each page's
    HTML is searched with a regular expression for an ``<a>`` wrapping an
    ``<img>``; the image path is cut out of the first match, prefixed with the
    site root and saved into ``savedir`` as ``<timestamp><page>.jpg``.
    Pages on which no picture can be located are reported and skipped.
    """
    url = r'http://www.junmeng.com/tj/22376.html'
    patnLink = r'<a href=".*/tj/22376_\d*.html"><img src.+</a>'
    savedir = r'C:\Users\hp\Desktop\results'
    if not os.path.exists(savedir):
        os.mkdir(savedir)

    for i in range(1, 20):
        if i == 1:
            tempurl = url
        else:
            tempurl = 'http://www.junmeng.com/tj/22376_%d.html' % i
        print(tempurl)
        data = urllib.urlopen(tempurl).read()

        if i == 19:
            # The final page is matched with a looser pattern -- presumably
            # because its picture no longer links to a following
            # 22376_<n>.html page (TODO confirm against the site).
            patnLink = r'<a href=.*><img src=.*</a>'

        imgLinks = re.findall(patnLink, data)
        if not imgLinks:
            # Previously an IndexError; skip the page instead of aborting.
            print('no image link found, skip %s' % tempurl)
            continue

        link = imgLinks[0]
        jpgEnd = link.find('.jpg')
        if jpgEnd == -1:
            print('no .jpg source found, skip %s' % tempurl)
            continue
        # +5 skips over 'src=' and the opening quote; +4 keeps '.jpg'.
        imgLink = link[link.find('src=') + 5:jpgEnd + 4]
        print(imgLink)

        fullLink = r'http://www.junmeng.com%s' % imgLink
        lct = time.strftime('%Y%m%d%H%M%S')
        urllib.urlretrieve(fullLink, os.path.join(savedir, '%s%d.jpg' % (lct, i)))

def reqImg2():
    """Download one picture from each of the 9 pages of ik6.com gallery 40569.

    Page 1 is ``index.html``; pages 2-9 are ``index_<n>.html``. On every page
    the ``lazysrc`` attribute of the first ``<img>`` inside the first
    ``<center>`` tag is taken as the picture URL, which is saved into
    ``savedir`` as ``<timestamp><page>.jpg``. The function stops at the first
    page that does not contain such a picture.
    """
    url = r'http://www.ik6.com/meinv/40569/index.html'
    savedir = r'C:\Users\hp\Desktop\results'
    if not os.path.exists(savedir):
        os.mkdir(savedir)

    for i in range(1, 10):
        if i == 1:
            tempurl = url
        else:
            tempurl = 'http://www.ik6.com/meinv/40569/index_%d.html' % i
        print(tempurl)

        data = urllib.urlopen(tempurl).read()
        page = BeautifulSoup(data)

        # Guard every lookup: a page without <center>/<img>/lazysrc used to
        # raise IndexError (or hand None to urlretrieve).
        centers = page.find_all('center')
        imgs = centers[0].find_all('img') if centers else []
        imgsrc = imgs[0].get('lazysrc') if imgs else None
        if not imgsrc:
            print('no image found on %s,exit' % tempurl)
            return
        print(imgsrc)

        lct = time.strftime('%Y%m%d%H%M%S')
        urllib.urlretrieve(imgsrc, os.path.join(savedir, '%s%d.jpg' % (lct, i)))

def reqImg3():
    """Download one picture from each of the 9 pages of ik6.com gallery 40572.

    Page 1 is ``index.html``; pages 2-9 are ``index_<n>.html``. On every page
    the ``lazysrc`` attribute of the first ``<img>`` inside the first
    ``<center>`` tag is taken as the picture URL, which is saved into
    ``savedir`` as ``<timestamp><page>.jpg``. The function stops at the first
    page that does not contain such a picture.
    """
    url = r'http://www.ik6.com/meinv/40572/index.html'
    savedir = r'C:\Users\hp\Desktop\results'
    if not os.path.exists(savedir):
        os.mkdir(savedir)

    for i in range(1, 10):
        if i == 1:
            tempurl = url
        else:
            tempurl = 'http://www.ik6.com/meinv/40572/index_%d.html' % i
        print(tempurl)

        data = urllib.urlopen(tempurl).read()
        page = BeautifulSoup(data)

        # Guard every lookup: a page without <center>/<img>/lazysrc used to
        # raise IndexError (or hand None to urlretrieve).
        centers = page.find_all('center')
        imgs = centers[0].find_all('img') if centers else []
        imgsrc = imgs[0].get('lazysrc') if imgs else None
        if not imgsrc:
            print('no image found on %s,exit' % tempurl)
            return
        print(imgsrc)

        lct = time.strftime('%Y%m%d%H%M%S')
        urllib.urlretrieve(imgsrc, os.path.join(savedir, '%s%d.jpg' % (lct, i)))

def reqImg4(url,themecount,imgcount):
    """Download up to *imgcount* pictures of the gallery whose first page is *url*.

    Page 1 is *url* itself; page ``n`` (n >= 2) is *url* with its ``.htm...``
    extension replaced by ``_<n>.html``. On every page the ``lazysrc``
    attribute of the first ``<img>`` inside the first ``<center>`` tag is taken
    as the picture URL and saved into ``savedir`` as
    ``<themecount>_<page>.jpg``.

    The download is best-effort: the function returns at the first page that
    yields no data, no picture, or raises an error, after printing the reason.

    Args:
        url: Address of the gallery's first page.
        themecount: Number used as the file-name prefix of every saved picture.
        imgcount: Number of pages (one picture each) to try.

    Returns:
        None.
    """
    savedir = r'C:\Users\hp\Desktop\result0128'
    if not os.path.exists(savedir):
        os.mkdir(savedir)

    # Drop the extension; if there is none keep the whole URL instead of
    # silently chopping its last character (rfind() returns -1).
    extPos = url.rfind('.htm')
    stem = url if extPos == -1 else url[:extPos]
    newUrl = stem + '_%d.html'
    print(newUrl)

    for i in range(1, imgcount + 1):
        if i == 1:
            tempurl = url
        else:
            # Format stem and index together so a '%' inside the URL cannot
            # be mistaken for a format directive.
            tempurl = '%s_%d.html' % (stem, i)
        print(tempurl)

        try:
            data = urllib.urlopen(tempurl).read()
            if not data:
                print('no response,exit')
                return

            page = BeautifulSoup(data)
            centers = page.find_all('center')
            if len(centers) == 0:
                print('response has no contents,exit')
                return

            imgs = centers[0].find_all('img')
            imgsrc = imgs[0].get('lazysrc') if imgs else None
            if not imgsrc:
                print('no image found on %s,exit' % tempurl)
                return
            print(imgsrc)

            urllib.urlretrieve(imgsrc, os.path.join(savedir, '%d_%d.jpg' % (themecount, i)))
        except Exception as e:
            # Still best-effort, but say why the gallery was abandoned
            # instead of returning silently.
            print('failed on %s: %s' % (tempurl, e))
            return

使用：

# Fetch two ordinary pages; req() echoes each response body to stdout.
req('http://blog.csdn.net/suwei19870312/article/details/8148427')
req('http://www.taobao.com')

# Download the three hard-coded galleries.
reqImg()
reqImg2()
reqImg3()

# Walk gallery ids 11170..12169 and try the first 8 pages of each.
for i in range(1000):
    count=11170+i
    url=r'http://www.ik6.com/meinv/%d/index.html'%count
    # reqImg4 takes (url, themecount, imgcount); the original call passed only
    # two arguments and raised TypeError. themecount becomes the file-name
    # prefix, so the gallery id keeps pictures of different galleries apart.
    reqImg4(url,count,8)