Please credit the original source when reposting:
http://www.cnblogs.com/darkknightzh/p/5715305.html
PubFig database homepage:
http://www.cs.columbia.edu/CAVE/databases/pubfig/
For copyright reasons the database does not ship the images themselves; it only provides download links, and some of those links are already dead.
Notes: 1. Some of the URLs cannot be reached without getting past the Great Firewall, so it is best to go through a proxy (a proxy sketch follows these notes).
2. Both dev_urls.txt and eval_urls.txt can be downloaded from the official site.
3. I am new to Python, so the code is not pretty and still has a couple of problems.
Problem 1: some image files simply no longer exist; this cannot be avoided.
Problem 2: sometimes connecting to a URL takes a very long time, after which an exception is thrown with a message similar to the following:
HTTPConnectionPool(host='www.stardepot.ca', port=): Max retries exceeded with url: /img/Miley_Cyrus_27.jpg (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x02AAC3B0>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed',))
I do not yet know how to fix this properly; a possible timeout/retry mitigation is sketched after the proxy example below.
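For note 1, a proxy can be set without touching the script: both requests and the Python 2 urllib module consult the standard proxy environment variables by default. This is only a sketch, and the address below is a placeholder for whatever proxy you actually run.

import os
os.environ['http_proxy'] = 'http://127.0.0.1:1080'    # placeholder proxy address, replace with your own
os.environ['https_proxy'] = 'http://127.0.0.1:1080'
# requests and urllib.urlopen() both pick these variables up automatically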
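The "Max retries exceeded" message above comes from requests failing to establish a connection (here the DNS lookup itself failed). I have not verified this against the whole url list, but one possible mitigation is to give every request an explicit timeout and retry a few times before skipping the image. The helper below is only a sketch along those lines; existsWithRetry and its timeout/retry values are my own additions, not part of the original script.

import socket
import time
import requests

socket.setdefaulttimeout(10)    # also bounds urllib.urlopen(), so a slow host cannot hang the loop

def existsWithRetry(path, retries=3, timeout=10):    # hypothetical replacement for exists()
    for attempt in range(retries):
        try:
            r = requests.head(path, timeout=timeout)
            return r.status_code == requests.codes.ok
        except requests.exceptions.RequestException:
            if attempt == retries - 1:
                return False    # give up and let the caller skip this image
            time.sleep(2)       # short pause before retrying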
__author__ = 'XXX'

import os
import numpy as np
import urllib
import re  # regular expression library
import requests
import time

# Return the start index of every occurrence of findStr inside inStr.
def findAllStrLoc(inStr, findStr):
    loc = []
    start = 0
    while True:
        curLoc = inStr.find(findStr, start)
        if curLoc == -1:        # if the search string is not found, find() returns -1
            break               # search is complete, break out of the while loop
        start = curLoc + 1      # move to the next possible start position
        loc.append(curLoc)
    return loc

# Parse the url list: each line holds person, imagenum, url, rect and md5sum
# separated by tabs; the first startLine header lines are skipped.
def loadData(dataPath, startLine):
    datas = []
    f = open(dataPath, 'r')     # with open(dataPath, 'r') as f:
    for line in f.readlines()[startLine:]:
        # data = line.strip().split()
        loc = findAllStrLoc(line, '\t')
        data = []
        data.append(line[0:(loc[0])])            # person (the end index of the sub-string is excluded)
        data.append(line[loc[0]+1:loc[1]])       # imagenum
        data.append(line[loc[1]+1:loc[2]])       # url
        rect = line[loc[2]+1:loc[3]]             # rect
        rectLoc = re.findall(r'\d+', rect)       # the four rect coordinates
        for ind in range(len(rectLoc)):
            data.append(rectLoc[ind])
        data.append(line[loc[3]+1:len(line)-1])  # md5sum (drop the trailing newline)
        datas.append(data)
    f.close()
    return np.array(datas)

# Create imgFolder if it does not exist yet.
def createimgfolder(imgFolder):
    if not os.path.isdir(imgFolder):
        os.makedirs(imgFolder)

# Derive the image file name (and the matching .txt name) from the url.
def getImgNameFromURL(url):
    loc = findAllStrLoc(url, '/')
    imgName = url[loc[len(loc)-1]+1:]        # everything after the last '/'
    txtName = imgName.split('.')[0] + '.txt'
    return (imgName, txtName)

# Check whether the url is still reachable by sending an HTTP HEAD request.
def exists(path):
    r = requests.head(path)
    return r.status_code == requests.codes.ok

def main():
    print('loading data')
    imgInfo = loadData('D:/dev_urls.txt', 2)    # the first two lines of the file are headers
    print('finish loading data\n')

    databaseFolder = 'D:/pubFig'
    createimgfolder(databaseFolder)

    for i in range(9526, len(imgInfo)):         # starts from index 9526; set to 0 to download from the beginning
        curtime = time.strftime('%y%m%d-%H%M%S', time.localtime())
        imgFolder = databaseFolder + '/' + imgInfo[i][0]    # one folder per person
        createimgfolder(imgFolder)
        url = imgInfo[i][2]
        (imgName, txtName) = getImgNameFromURL(url)
        try:
            if exists(url):
                page = urllib.urlopen(url)      # Python 2 urllib
                img = page.read()
                page.close()

                imgPath = imgFolder + '/' + imgName
                f = open(imgPath, "wb")
                f.write(img)
                f.close()

                txtPath = imgFolder + '/' + txtName    # save the four rect coordinates next to the image
                f = open(txtPath, "w")
                for j in range(4):
                    f.write(imgInfo[i][j+3] + ' ')
                f.close()

                print('%s:%d/%d %s finish' % (curtime, i+1, len(imgInfo), url))
            else:
                print('%s:%d/%d %s does not exist' % (curtime, i+1, len(imgInfo), url))
        except Exception as e:
            print('%s:%d/%d %s exception %s' % (curtime, i+1, len(imgInfo), url, e))

    print('finish')

if __name__ == '__main__':
    main()
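The script above targets Python 2 (urllib.urlopen no longer exists in Python 3). If you want to run it under Python 3, I believe the only download-related change needed is to switch to urllib.request, roughly as follows (untested):

from urllib.request import urlopen    # Python 3 equivalent of Python 2's urllib.urlopen
page = urlopen(url)
img = page.read()
page.close()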