Keras 多GPU模式在计算val的准确率时报错

时间:2022-04-28 14:01:49

错误信息:

Error: tensorflow/stream_executor/cuda/cuda_dnn.cc:444] could not convert BatchDescriptor {count: 0 feature_map_count: 32 spatial: 149 149 value_min: 0.000000 value_max: 0.000000 layout: BatchDepthYX} to cudnn tensor descriptor: CUDNN_STATUS_BAD_PARAM

github上的解决方法
这个是由于keras源码的问题:
keras/preprocessing/image.py 中我们可以看到:

class Iterator(Sequence):
    """Base class for image data iterators.

    Every `Iterator` must implement the
    `_get_batches_of_transformed_samples` method.

    # Arguments
        n: Integer, total number of samples in the dataset to loop over.
        batch_size: Integer, size of a batch.
        shuffle: Boolean, whether to shuffle the data between epochs.
        seed: Random seeding for data shuffling.
    """

    def __init__(self, n, batch_size, shuffle, seed):
        self.n = n
        self.batch_size = batch_size
        self.seed = seed
        self.shuffle = shuffle
        self.batch_index = 0
        self.total_batches_seen = 0
        self.lock = threading.Lock()
        # index_array is built lazily on first use (see __getitem__).
        self.index_array = None
        self.index_generator = self._flow_index()

    def _set_index_array(self):
        # Identity order by default; replaced by a fresh random
        # permutation when shuffling is enabled.
        self.index_array = np.arange(self.n)
        if self.shuffle:
            self.index_array = np.random.permutation(self.n)

    def __getitem__(self, idx):
        """Return batch `idx` via `_get_batches_of_transformed_samples`."""
        if idx >= len(self):
            raise ValueError('Asked to retrieve element {idx}, '
                             'but the Sequence '
                             'has length {length}'.format(idx=idx,
                                                          length=len(self)))
        if self.seed is not None:
            # Deterministic yet batch-dependent seeding: the offset grows
            # with every batch ever requested.
            np.random.seed(self.seed + self.total_batches_seen)
        self.total_batches_seen += 1
        if self.index_array is None:
            self._set_index_array()
        # NOTE(review): for the final index this slice can contain fewer
        # than batch_size entries (because __len__ rounds up) — the short
        # batch this post identifies as the multi-GPU "count: 0" trigger.
        index_array = self.index_array[self.batch_size * idx:
                                       self.batch_size * (idx + 1)]
        return self._get_batches_of_transformed_samples(index_array)

    def __len__(self):
        return (self.n + self.batch_size - 1) // self.batch_size  # round up

......
......
...

这里的 len 是向上取整的。
这导致最后一个 batch 的样本数可能小于 batch_size;多 GPU 模式下再按 GPU 数切分时,某些 GPU 会分到 0 个样本,于是出现上面 `count: 0` 的报错。
因此解决方法是:

  1. 保证数据量大小 n 与 batch_size 满足:n % batch_size == 0,或 n % batch_size >= num_gpu(这样最后一个不完整的 batch 切分后,每块 GPU 至少分到一个样本)
  2. 不改源码的话保证数据量永远能被可能的gpu数量整除:

    # Make the dataset size divisible by every candidate GPU count so the
    # last batch always splits evenly across devices.
    import math

    possible_num_gpus = [1, 2, 3, 4, 8]
    lcm = math.lcm(*possible_num_gpus)  # lcm == 24
    assert iterator.n % lcm == 0
    
  3. 或者更改源码:
    def __init__(self, n, batch_size, shuffle, seed):
        """Patched constructor: additionally records how many trailing
        samples must be dropped so that every batch is complete."""
        self.n = n
        self.batch_size = batch_size
        self.seed = seed
        self.shuffle = shuffle
        self.batch_index = 0
        self.total_batches_seen = 0
        self.lock = threading.Lock()
        # index_array is built lazily, per epoch, by _set_index_array().
        self.index_array = None
        self.index_generator = self._flow_index()
        # Samples that do not fill a whole batch; _set_index_array and
        # _flow_index use this to skip the short final batch entirely.
        self.excess_elements = self.n % self.batch_size

    def _set_index_array(self):
        """Build the index order for one epoch, dropping the trailing
        ``excess_elements`` samples so only complete batches remain.

        Note: when shuffling, all ``n`` samples are permuted *before*
        truncation, so the dropped samples differ from epoch to epoch.
        """
        keep = self.n - self.excess_elements
        if self.shuffle:
            self.index_array = np.random.permutation(self.n)[:keep]
        else:
            # arange(keep) == arange(n)[:keep]; avoids allocating and
            # discarding a full-length array (and drops the dead
            # commented-out line from the original patch).
            self.index_array = np.arange(keep)

    def __len__(self):
        """Number of batches per epoch: floor division, so the incomplete
        final batch (the multi-GPU crash trigger) is never scheduled."""
        return self.n // self.batch_size  # round down

    def _flow_index(self):
        """Infinite generator of per-batch index slices (patched so the
        short final batch is never emitted)."""
        # Ensure self.batch_index is 0.
        self.reset()
        while 1:
            if self.seed is not None:
                # Reproducible shuffling that still advances with the
                # total number of batches seen.
                np.random.seed(self.seed + self.total_batches_seen)
            if self.batch_index == 0:
                # (Re)build index_array at the start of each epoch; it
                # holds n - excess_elements indices — complete batches only.
                self._set_index_array()

            current_index = (self.batch_index * self.batch_size) % self.n
            # Advance only while another full batch fits inside the
            # truncated index_array (equivalent to
            # current_index + batch_size < n - excess_elements);
            # otherwise wrap around and start a new epoch.
            if self.n > current_index + self.batch_size + self.excess_elements:
                self.batch_index += 1
            else:
                self.batch_index = 0
            self.total_batches_seen += 1
            yield self.index_array[current_index:
                                   current_index + self.batch_size]