3. 下载图片
[root@node1 python]# mkdir image
[root@node1 python]# cd image
[root@node1 image]# vim getHtml.py
#!/usr/bin/python
"""Step 1: fetch the HTML source of the hard-coded page and print it.

Written for Python 2 (``urllib.urlopen``).
"""
import re
import urllib


def getHtml(url):
    """Return the raw response body of *url* as a string.

    The response object is closed explicitly once the body has been read,
    so the connection is released even if ``read()`` raises.
    """
    html = urllib.urlopen(url)
    try:
        return html.read()
    finally:
        html.close()


# Guarded so that importing this file does not trigger a network request.
if __name__ == '__main__':
    print(getHtml('http://tieba.baidu.com/p/1762577651'))
第二步,获取图片相关地址(正则匹配)
从取回的源代码中分析图片相关 URL 的构造,然后通过正则表达式将图片地址提取出来。
源文件中图片的标签是这样的:
<img class="BDE_Image" src="http://imgsrc.baidu.com/forum/w%3D580/sign=2e8f3ca53af33a879e6d0012f65d1018/4ece3bc79f3df8dc2ab63004cd11728b46102899.jpg" width="560" height="400" changedsize="true">
要获取的是http://imgsrc.baidu.com/xxxxxxx.jpg
#!/usr/bin/python
"""Step 2: extract the .jpg image URLs from the page source and print them.

Written for Python 2 (``urllib.urlopen``).
"""
import re
import urllib


def getHtml(url):
    """Return the raw response body of *url* as a string."""
    html = urllib.urlopen(url)
    try:
        return html.read()
    finally:
        # Release the connection even if read() raises.
        html.close()


def getImage(source):
    """Return every image URL found in *source*.

    A URL is captured when it appears as ``src="<url ending in .jpg>"``
    immediately followed by a ``width=`` attribute. An empty list is
    returned when nothing matches.
    """
    # The pattern must not be bound to the name ``re``: doing so hides the
    # ``re`` module inside this function and ``re.compile`` then fails with
    # AttributeError on the string.
    pattern = r'src="(.*?\.jpg)" width='
    imgre = re.compile(pattern)
    return imgre.findall(source)


# Guarded so that importing this file does not trigger a network request.
if __name__ == '__main__':
    source = getHtml('http://tieba.baidu.com/p/1762577651')
    print(getImage(source))
#!/usr/bin/python
"""Step 3: download the images referenced by the page.

Written for Python 2 (``urllib.urlopen`` / ``urllib.urlretrieve``).
"""
import re
import urllib


def getHtml(url):
    """Return the raw response body of *url* as a string."""
    html = urllib.urlopen(url)
    try:
        return html.read()
    finally:
        # Release the connection even if read() raises.
        html.close()


def getImage(source):
    """Download every .jpg image referenced in *source*.

    A URL is selected when it appears as ``src="<url ending in .jpg>"``
    immediately followed by a ``width=`` attribute. The function returns
    None.
    """
    # The pattern must not be bound to the name ``re``: doing so hides the
    # ``re`` module inside this function and ``re.compile`` then fails with
    # AttributeError on the string.
    pattern = r'src="(.*?\.jpg)" width='
    imgre = re.compile(pattern)
    images = imgre.findall(source)
    for i in images:
        # NOTE: every image is written to the same file name, so each
        # download overwrites the previous one and only the last survives.
        urllib.urlretrieve(i, '1.jpg')


# Guarded so that importing this file does not trigger a network request.
if __name__ == '__main__':
    source = getHtml('http://tieba.baidu.com/p/1762577651')
    print(getImage(source))
#!/usr/bin/python
"""Download the images referenced by the page into numbered files.

Written for Python 2 (``urllib.urlopen`` / ``urllib.urlretrieve``).
"""
import re
import urllib


def getHtml(url):
    """Return the raw response body of *url* as a string."""
    html = urllib.urlopen(url)
    try:
        return html.read()
    finally:
        # Release the connection even if read() raises.
        html.close()


def getImage(source):
    """Download every .jpg image referenced in *source*.

    A URL is selected when it appears as ``src="<url ending in .jpg>"``
    immediately followed by a ``width=`` attribute. Images are saved in the
    current directory as ``0.jpg``, ``1.jpg``, ... in the order they are
    found. The function returns None.
    """
    # The pattern must not be bound to the name ``re``: doing so hides the
    # ``re`` module inside this function and ``re.compile`` then fails with
    # AttributeError on the string.
    pattern = r'src="(.*?\.jpg)" width='
    imgre = re.compile(pattern)
    images = imgre.findall(source)
    # enumerate() supplies the running index used as the file name.
    for x, i in enumerate(images):
        urllib.urlretrieve(i, '%s.jpg' % x)


# Guarded so that importing this file does not trigger a network request.
if __name__ == '__main__':
    source = getHtml('http://tieba.baidu.com/p/1762577651')
    print(getImage(source))
[root@node1 image]# python getHtml.py
[root@node1 image]# ls
11.jpg 13.jpg 15.jpg 17.jpg 19.jpg 20.jpg 3.jpg 5.jpg 7.jpg 9.jpg
10.jpg 12.jpg 14.jpg 16.jpg 18.jpg 1.jpg 2.jpg 4.jpg 6.jpg 8.jpg getHtml.py