最基本的抓取网页内容的代码实现:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
|
#!/usr/bin/env python
from urllib import urlretrieve
def firstNonBlank(lines):
    """Return the first line that contains a non-whitespace character.

    Args:
        lines: any iterable of strings (e.g. the result of ``readlines()``).

    Returns:
        The first element whose ``strip()`` is non-empty, unchanged
        (trailing newline included), or ``None`` when every line is blank
        or ``lines`` is empty.
    """
    for eachLine in lines:
        # strip() leaves an empty (falsy) string for whitespace-only lines.
        if eachLine.strip():
            return eachLine
    return None
def firstLast(webpage):
    """Print the first and the last non-blank line of a text file.

    Args:
        webpage: path of the local file to inspect (for example the
            temporary file produced by a download).

    The lines are written verbatim, including their own trailing newline.
    Nothing is written for a file that has no non-blank line.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import sys

    # The context manager closes the file even if reading raises.
    with open(webpage) as f:
        lines = f.readlines()

    # Scan forwards for the first non-blank line, then backwards for the
    # last one; reversed() avoids mutating the list in place.
    for candidates in (lines, reversed(lines)):
        line = firstNonBlank(candidates)
        if line is not None:
            # sys.stdout.write emits the text as-is on both Python 2 and 3,
            # unlike the Python-2-only ``print x,`` statement.
            sys.stdout.write(line)
def download(url='http://www', process=firstLast):
    """Fetch ``url`` to a local file and hand that file to ``process``.

    Args:
        url: address to retrieve.
            NOTE(review): the default looks like a placeholder host;
            callers presumably pass a real URL -- confirm.
        process: callable invoked with the path of the downloaded file.

    A failed download (``IOError``) is swallowed on purpose: in that case
    ``process`` is simply not called.
    """
    try:
        # urlretrieve returns a (local_filename, headers) pair; only the
        # file name is needed here.
        retval = urlretrieve(url)[0]
    except IOError:
        retval = None
    if retval:
        process(retval)
# Run the demo download only when executed as a script, not on import.
if __name__ == '__main__':
    download()
|
利用urllib模块,来实现一个网页中针对图片的抓取功能:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
|
import urllib.request
import socket
import re
import sys
import os
# Directory that downloaded pictures are saved into.
targetDir = r"C:\Users\elqstux\Desktop\pic"


def destFile(path):
    """Return the local file path under ``targetDir`` for a picture URL.

    Args:
        path: URL (or any '/'-separated path) of the picture.

    Returns:
        ``targetDir`` joined with the part of ``path`` after its last '/'.
        A ``path`` without any '/' is used as the file name unchanged.

    Side effect: ``targetDir`` is created if it does not exist yet.
    """
    # makedirs also creates missing parent directories, and exist_ok avoids
    # failing when the directory is already there.
    os.makedirs(targetDir, exist_ok=True)
    # The file name is whatever follows the last '/' of the URL.
    fileName = path.rsplit('/', 1)[-1]
    return os.path.join(targetDir, fileName)
# Script entry point: download every jpg/png/gif linked from the page.
if __name__ == "__main__":
    hostname = "http://www.douban.com"
    req = urllib.request.Request(hostname)
    # The context manager closes the HTTP response even if read() raises.
    with urllib.request.urlopen(req) as webpage:
        contentBytes = webpage.read()
    # Decode instead of str(bytes): str() yields the b'...' repr, where line
    # breaks become the two characters "\n" and no longer stop [^\s].
    # The URLs of interest are ASCII, so undecodable bytes are just replaced.
    content = contentBytes.decode("utf-8", errors="replace")
    # findall yields (full_url, extension) tuples because the pattern has
    # two groups; set() drops duplicate links so each is fetched once.
    for link, t in set(re.findall(r'(http:[^\s]*?(jpg|png|gif))', content)):
        print(link)
        urllib.request.urlretrieve(link, destFile(link))
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
|
import urllib.request
import socket
import re
import sys
import os
# Directory that downloaded pictures are saved into.
targetDir = r"H:\pic"


def destFile(path):
    """Return the local file path under ``targetDir`` for a picture URL.

    Args:
        path: URL (or any '/'-separated path) of the picture.

    Returns:
        ``targetDir`` joined with the part of ``path`` after its last '/'.
        A ``path`` without any '/' is used as the file name unchanged.

    Side effect: ``targetDir`` is created if it does not exist yet.
    """
    # makedirs also creates missing parent directories, and exist_ok avoids
    # failing when the directory is already there.
    os.makedirs(targetDir, exist_ok=True)
    # The URL is split at its last '/'; the remainder is the file name.
    fileName = path.rsplit('/', 1)[-1]
    return os.path.join(targetDir, fileName)
# Script entry point: list every jpg/png/gif link found on the page.
if __name__ == "__main__":
    hostname = "http://www.douban.com/"
    req = urllib.request.Request(hostname)
    # The context manager closes the HTTP response even if read() raises.
    with urllib.request.urlopen(req) as webpage:
        contentBytes = webpage.read()
    # Decode instead of str(bytes): str() yields the b'...' repr, where line
    # breaks become the two characters "\n" and no longer stop [^\s].
    # The URLs of interest are ASCII, so undecodable bytes are just replaced.
    content = contentBytes.decode("utf-8", errors="replace")
    # The pattern contains two nested pairs of parentheses, hence two groups.
    # findall returns a list holding only the text captured by the groups,
    # i.e. one (full_url, extension) tuple per match.
    match = re.findall(r'(http:[^\s]*?(jpg|png|gif))', content)
    for picname, picType in match:
        print(picname)
        print(picType)
    # Sample output:
    #   http://img3.douban.com/pics/blank.gif
    #   gif
    #   http://img3.douban.com/icon/g111328-1.jpg
    #   jpg
    #   http://img3.douban.com/pics/blank.gif
    #   gif
    #   http://img3.douban.com/icon/g197523-19.jpg
    #   jpg
    #   http://img3.douban.com/pics/blank.gif
    #   gif
    #   ...
|