python爬某个网站的图片

时间:2021-09-09 11:04:00
# _*_ coding: gbk _*_
import urllib
import urllib2
import re
class Spider:
    """Download the first few JPEG images referenced by a web page.

    Written for Python 2: it relies on the module-level ``urllib`` and
    ``urllib2`` imports.
    """

    # Captures the URL inside ``src="http:...jpg``. The character class stops
    # at the attribute's closing quote, so several <img> tags on one line give
    # one URL each instead of a single match running up to the last ".jpg".
    IMAGE_PATTERN = r'src="(http:[^"]*\.jpg)'

    # At most this many images are downloaded per call.
    MAX_IMAGES = 2

    # Destination file template; %s receives the 0-based image index.
    SAVE_TEMPLATE = 'E:\\images\\%s.jpg'

    def _find_image_urls(self, html):
        """Return the http ``.jpg`` URLs found in ``src`` attributes.

        ``html`` is the page markup; URLs are returned in page order and an
        empty list is returned when nothing matches.
        """
        return re.findall(self.IMAGE_PATTERN, html)

    def getImage(self, html):
        """Fetch a page and save its first ``MAX_IMAGES`` JPEG images.

        ``html`` is the URL of the page to fetch (the name is kept for
        backward compatibility). Each image URL is printed and then saved to
        ``SAVE_TEMPLATE % index``. Errors raised by ``urllib2.urlopen`` or
        ``urllib.urlretrieve`` propagate to the caller.
        """
        page = urllib2.urlopen(html)
        try:
            content = page.read()
        finally:
            # Release the connection even if reading fails.
            page.close()

        image_urls = self._find_image_urls(content)[:self.MAX_IMAGES]
        for index, url in enumerate(image_urls):
            print(url)
            urllib.urlretrieve(url, self.SAVE_TEMPLATE % index)
        print('the end')


if __name__ == "__main__":
    print('hello')
    s = Spider()
    # Alternative sample page:
    # html = r"http://baike.baidu.com/link?url=pj6QaA2Zyrxx2WcD4f7vN50LWVIZjJUKYdnnLGMOWnmInlALGH4dXmU86hE3Ar-jmaiahjf2MiEZ3n_0WCOUlFuKwVfYZNKnBwxidD1cC3i"
    html = r"http://baike.baidu.com/link?url=rHaKx7RPBWuR4MxzY0BPhwbLKH4DEdwKPN8EYH-78Zzm7IMUuFTYM0eUZw-j27lHxDxyyNiqkjUg4JG8FvyjNUsuqiTzLixsNSXUtTWiOpQqrtxbf4hkj-n6gF1Nyn4D"
    s.getImage(html)

  

用 Python 从某个网站批量抓取图片的 URL：主要来自百度风云榜的男演员、女演员、男歌手、女歌手四个榜单，总共 200 条。

# _*_ coding: gbk _*_
import urllib
import urllib2
import re
import os
class Spider:
    """Collect the link targets of the "简介" anchors on a page into a file.

    Written for Python 2: it relies on the module-level ``urllib2`` import.
    """

    # Captures the href value of an anchor whose text starts with "简介".
    # Taking the URL from a capture group avoids the greedy ".*" (which can
    # span several links on one line) and avoids stripping a fixed number of
    # trailing bytes, which only works for one source encoding and one exact
    # tag layout.
    LINK_PATTERN = r'href="(http:[^"]*)"[^>]*>\s*简介'

    # Output file, one URL per line. The literal is raw, so the path really
    # contains doubled backslashes; it is kept byte-for-byte.
    # NOTE(review): presumably e:\images\paths.txt was intended - confirm.
    OUTPUT_PATH = r'e:\\images\\paths.txt'

    def _find_profile_links(self, html):
        """Return the http URLs of all "简介" anchors in ``html``.

        URLs are returned in page order; an empty list is returned when
        nothing matches.
        """
        return re.findall(self.LINK_PATTERN, html)

    def getImage(self, html):
        """Fetch a page and write the URLs of its "简介" links to a file.

        ``html`` is the URL of the page to fetch (the name is kept for
        backward compatibility). Every URL found is printed and written to
        ``OUTPUT_PATH`` on its own line, then the number of URLs is printed.
        Errors raised by ``urllib2.urlopen`` or by opening the output file
        propagate to the caller.
        """
        page = urllib2.urlopen(html)
        try:
            content = page.read()
        finally:
            # Release the connection even if reading fails.
            page.close()

        links = self._find_profile_links(content)
        with open(self.OUTPUT_PATH, 'w+') as f:
            for link in links:
                print(link)
                f.write(link + '\n')
        print(len(links))
        print('the end')


if __name__ == "__main__":
    print('hello')
    s = Spider()
    html = r'http://top.baidu.com/buzz?b=18&qq-pf-to=pcqq.group'
    s.getImage(html)