torchvision(PyTorch 的视觉工具库)提供了 ImageFolder 类,用来加载文件结构如下的图片数据集:
root/dog/xxx.png
root/dog/xxy.png
root/dog/xxz.png
root/cat/123.png
root/cat/nsdf3.png
root/cat/asd932_.png
使用这个类的问题在于,它无法把训练集(training dataset)和验证集(validation dataset)分开。为此我写了两个类来完成这项工作:一个负责按比例划分文件列表,另一个把划分后的文件列表包装成 Dataset。代码如下:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import ToTensor, Resize, Compose
from PIL import Image
from sklearn.model_selection import train_test_split
class ImageFolderSplitter:
    """Split an image folder into training and validation file lists.

    Images are expected in one sub-folder per class, directly under the
    root folder:

        root/dogs/image1.png
        root/dogs/image2.png
        root/cats/image1.png
        root/cats/image2.png

    The sub-folder names become the class names. Class indices are assigned
    in sorted order of the folder names so that the label mapping is the
    same on every run and every platform.
    """

    def __init__(self, path, train_size=0.8, random_state=None):
        """Scan ``path`` and split the discovered images.

        Args:
            path: Root of the image folder.
            train_size: Passed to ``train_test_split`` as ``train_size``.
            random_state: Passed to ``train_test_split``; set it to get a
                reproducible split. ``None`` keeps the split random.

        Raises:
            RuntimeError: If a visited folder mixes files and sub-folders,
                is empty, has a single sub-folder and no files, or holds
                images but is not one of the registered class folders.
        """
        self.path = path
        self.train_size = train_size
        self.class2num = {}     # class name -> integer label
        self.num2class = {}     # integer label -> class name
        self.class_nums = {}    # integer label -> number of images found
        self.data_x_path = []   # full path of every image
        self.data_y_label = []  # integer label of every image (parallel list)
        self.x_train = []
        self.x_valid = []
        self.y_train = []
        self.y_valid = []

        for root, dirs, files in os.walk(path):
            if not files and len(dirs) > 1:
                # Folder of class folders. Sorting in place also makes
                # os.walk visit the class folders in this order.
                dirs.sort()
                for index, class_name in enumerate(dirs):
                    self.num2class[index] = class_name
                    self.class2num[class_name] = index
            elif files and not dirs:
                # Class folder. Match on the folder's own name rather than
                # a substring of the whole path, so "cat" is never confused
                # with "wildcat" or with a component of the root path.
                class_name = os.path.basename(root)
                if class_name not in self.class2num:
                    raise RuntimeError("please check the folder structure!")
                label = self.class2num[class_name]
                files.sort()
                self.data_x_path.extend(
                    os.path.join(root, file_name) for file_name in files
                )
                self.data_y_label.extend([label] * len(files))
                self.class_nums[label] = len(files)
            else:
                raise RuntimeError("please check the folder structure!")

        (
            self.x_train,
            self.x_valid,
            self.y_train,
            self.y_valid,
        ) = train_test_split(
            self.data_x_path,
            self.data_y_label,
            shuffle=True,
            train_size=self.train_size,
            random_state=random_state,
        )

    def getTrainingDataset(self):
        """Return ``(paths, labels)`` of the training portion."""
        return self.x_train, self.y_train

    def getValidationDataset(self):
        """Return ``(paths, labels)`` of the validation portion."""
        return self.x_valid, self.y_valid
class DatasetFromFilename(Dataset):
    """Dataset that loads images lazily from a list of file paths.

    Args:
        x: A list of image file full paths.
        y: A list of image categories, parallel to ``x``.
        transforms: Callable applied to each loaded RGB image. When omitted,
            ``ToTensor()`` is used.
    """

    def __init__(self, x, y, transforms=None):
        super(DatasetFromFilename, self).__init__()
        self.x = x
        self.y = y
        # Identity check: a custom transform object must not be asked to
        # compare itself against None.
        self.transforms = ToTensor() if transforms is None else transforms

    def __len__(self):
        """Return the number of image paths held by the dataset."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return ``(transformed_image, label_tensor)`` for item ``idx``.

        The label is wrapped as ``torch.tensor([[label]])``.
        """
        # Close the file as soon as the pixel data has been converted;
        # convert() yields a separate image that no longer needs the file.
        with Image.open(self.x[idx]) as raw_image:
            img = raw_image.convert("RGB")
        return self.transforms(img), torch.tensor([[self.y[idx]]])
# test code
# splitter = ImageFolderSplitter("for_test")
# transforms = Compose([Resize((51, 51)), ToTensor()])
# x_train, y_train = splitter.getTrainingDataset()
# training_dataset = DatasetFromFilename(x_train, y_train, transforms=transforms)
# training_dataloader = DataLoader(training_dataset, batch_size=2, shuffle=True)
# x_valid, y_valid = splitter.getValidationDataset()
# validation_dataset = DatasetFromFilename(x_valid, y_valid, transforms=transforms)
# validation_dataloader = DataLoader(validation_dataset, batch_size=2, shuffle=True)
# for x, y in training_dataloader:
# print(x.shape, y.shape)
|
更多的代码可以在我的 GitHub repo 中找到。
原文链接:https://blog.csdn.net/xgbm_k/article/details/84325347