在crnn训练的时候需要用到lmdb格式的数据集,下面是python生成lmdb格式数据集的代码。注意一定要在linux系统下运行,否则读入图像的时候会出问题;可能遇到的问题都在代码里面注释了,看代码即可。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
|
#-*- coding:utf-8 -*-
import os
import lmdb #先pip install这个模块哦
import cv2
import glob
import numpy as np
def checkImageIsValid(imageBin):
    """Return True if ``imageBin`` holds decodable image bytes with non-zero area.

    Args:
        imageBin: raw image file contents (bytes), or None.

    Returns:
        bool: False for None input, undecodable data, or a zero-area image.
    """
    if imageBin is None:
        return False
    # np.fromstring is deprecated for binary input; frombuffer is the
    # zero-copy replacement with identical semantics for bytes.
    imageBuf = np.frombuffer(imageBin, dtype=np.uint8)
    img = cv2.imdecode(imageBuf, cv2.IMREAD_GRAYSCALE)
    if img is None:
        return False
    imgH, imgW = img.shape[0], img.shape[1]
    # Reject degenerate (zero-width or zero-height) images.
    if imgH * imgW == 0:
        return False
    return True
def writeCache(env, cache):
    """Write every (key, value) pair in ``cache`` to the LMDB env in one transaction.

    LMDB only accepts bytes keys/values, so str entries (keys, text labels)
    are UTF-8 encoded here; bytes entries (raw image data) pass through
    untouched.

    Args:
        env: an opened ``lmdb.Environment``.
        cache: dict mapping str/bytes keys to str/bytes values.
    """
    def _to_bytes(v):
        # Encode str lazily so callers may supply either str or bytes.
        return v.encode('utf-8') if isinstance(v, str) else v

    with env.begin(write=True) as txn:
        # dict.iteritems() was Python 2 only; items() works everywhere.
        for k, v in cache.items():
            txn.put(_to_bytes(k), _to_bytes(v))
def createDataset(outputPath, imagePathList, labelList, lexiconList=None, checkValid=True):
    """
    Create an LMDB dataset for CRNN training.

    ARGS:
        outputPath    : LMDB output path
        imagePathList : list of image paths
        labelList     : list of corresponding ground-truth texts
        lexiconList   : (optional) list of lexicon lists
        checkValid    : if True, check the validity of every image

    Raises:
        ValueError: if imagePathList and labelList differ in length.
    """
    # Raise instead of assert: asserts are stripped under ``python -O``.
    if len(imagePathList) != len(labelList):
        raise ValueError('imagePathList and labelList must have the same length')
    nSamples = len(imagePathList)
    print('...................')
    # map_size is the maximum size (in bytes) the database may grow to.
    # 8 GB here; the upstream default of 1 TB (1099511627776) can fail with a
    # "disk space" error on smaller volumes.
    env = lmdb.open(outputPath, map_size=8589934592)
    cache = {}
    cnt = 1
    for i in range(nSamples):
        imagePath = imagePathList[i]
        label = labelList[i]
        if not os.path.exists(imagePath):
            print('%s does not exist' % imagePath)
            continue
        # Read in BINARY mode: text mode mangles image bytes (this is the
        # cross-platform bug the original comments warned about).
        with open(imagePath, 'rb') as f:
            imageBin = f.read()
        if checkValid:
            if not checkImageIsValid(imageBin):
                print('%s is not a valid image' % imagePath)
                continue
        imageKey = 'image-%09d' % cnt
        labelKey = 'label-%09d' % cnt
        cache[imageKey] = imageBin
        cache[labelKey] = label
        if lexiconList:
            lexiconKey = 'lexicon-%09d' % cnt
            cache[lexiconKey] = ' '.join(lexiconList[i])
        # Flush every 1000 samples to keep the in-memory cache bounded.
        if cnt % 1000 == 0:
            writeCache(env, cache)
            cache = {}
            print('Written %d / %d' % (cnt, nSamples))
        cnt += 1
    nSamples = cnt - 1
    cache['num-samples'] = str(nSamples)
    writeCache(env, cache)
    print('Created dataset with %d samples' % nSamples)
def read_text(path):
    """Read the label file at ``path`` and return its whitespace-stripped text.

    UTF-8 is specified explicitly so labels decode identically on every
    platform (the default encoding varies, e.g. on Windows).

    Args:
        path: path to a text file containing one ground-truth label.

    Returns:
        str: file contents with leading/trailing whitespace removed.
    """
    with open(path, encoding='utf-8') as f:
        return f.read().strip()
if __name__ == '__main__':
    # LMDB output directory. Run this script twice, once for the training
    # set and once for the validation set, changing outputPath each time.
    outputPath = 'D:/ruanjianxiazai/tuxiangyangben/fengehou/train'
    # Each .jpg must have a matching .txt label file in the same directory.
    path = "D:/ruanjianxiazai/tuxiangyangben/fengehou/chenguang/*.jpg"
    imagePathList = glob.glob(path)
    print('------------', len(imagePathList), '------------')
    imgLabelLists = []
    for p in imagePathList:
        try:
            imgLabelLists.append((p, read_text(p.replace('.jpg', '.txt'))))
        except OSError:
            # Skip images whose label file is missing or unreadable; a bare
            # except here would also hide genuine bugs.
            continue
    # Sort by label length so batches contain labels of similar length.
    imgLabelList = sorted(imgLabelLists, key=lambda x: len(x[1]))
    imgPaths = [p[0] for p in imgLabelList]
    txtLists = [p[1] for p in imgLabelList]
    createDataset(outputPath, imgPaths, txtLists, lexiconList=None, checkValid=True)
|
以上这篇python生成lmdb格式的文件实例就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持服务器之家。
原文链接:https://blog.csdn.net/dulingtingzi/article/details/79585180