在上一篇的博客讲述了SSD的原理,这一篇主要是讲解keras的实现。
keras代码的github地址为:点击打开链接
model 的框架实现(ssd.py):
先给出了改变后的VGG16的实现:
- def SSD300(input_shape, num_classes=21):
- #Input_shape 为输入的形状(300,300,3)
- #num_class 为需要检测的种类。
- # Block 1
- input_tensor = input_tensor = Input(shape=input_shape)
- img_size = (input_shape[1], input_shape[0])
- net[\'input\'] = input_tensor
- net[\'conv1_1\'] = Convolution2D(64, 3, 3,
- activation=\'relu\',
- border_mode=\'same\',
- name=\'conv1_1\')(net[\'input\'])
- net[\'conv1_2\'] = Convolution2D(64, 3, 3,
- activation=\'relu\',
- border_mode=\'same\',
- name=\'conv1_2\')(net[\'conv1_1\'])
- net[\'pool1\'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode=\'same\',
- name=\'pool1\')(net[\'conv1_2\'])
- # Block 2
- net[\'conv2_1\'] = Convolution2D(128, 3, 3,
- activation=\'relu\',
- border_mode=\'same\',
- name=\'conv2_1\')(net[\'pool1\'])
- net[\'conv2_2\'] = Convolution2D(128, 3, 3,
- activation=\'relu\',
- border_mode=\'same\',
- name=\'conv2_2\')(net[\'conv2_1\'])
- net[\'pool2\'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode=\'same\',
- name=\'pool2\')(net[\'conv2_2\'])
- # Block 3
- net[\'conv3_1\'] = Convolution2D(256, 3, 3,
- activation=\'relu\',
- border_mode=\'same\',
- name=\'conv3_1\')(net[\'pool2\'])
- net[\'conv3_2\'] = Convolution2D(256, 3, 3,
- activation=\'relu\',
- border_mode=\'same\',
- name=\'conv3_2\')(net[\'conv3_1\'])
- net[\'conv3_3\'] = Convolution2D(256, 3, 3,
- activation=\'relu\',
- border_mode=\'same\',
- name=\'conv3_3\')(net[\'conv3_2\'])
- net[\'pool3\'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode=\'same\',
- name=\'pool3\')(net[\'conv3_3\'])
- # Block 4
- net[\'conv4_1\'] = Convolution2D(512, 3, 3,
- activation=\'relu\',
- border_mode=\'same\',
- name=\'conv4_1\')(net[\'pool3\'])
- net[\'conv4_2\'] = Convolution2D(512, 3, 3,
- activation=\'relu\',
- border_mode=\'same\',
- name=\'conv4_2\')(net[\'conv4_1\'])
- net[\'conv4_3\'] = Convolution2D(512, 3, 3,
- activation=\'relu\',
- border_mode=\'same\',
- name=\'conv4_3\')(net[\'conv4_2\'])
- net[\'pool4\'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode=\'same\',
- name=\'pool4\')(net[\'conv4_3\'])
- # Block 5
- net[\'conv5_1\'] = Convolution2D(512, 3, 3,
- activation=\'relu\',
- border_mode=\'same\',
- name=\'conv5_1\')(net[\'pool4\'])
- net[\'conv5_2\'] = Convolution2D(512, 3, 3,
- activation=\'relu\',
- border_mode=\'same\',
- name=\'conv5_2\')(net[\'conv5_1\'])
- net[\'conv5_3\'] = Convolution2D(512, 3, 3,
- activation=\'relu\',
- border_mode=\'same\',
- name=\'conv5_3\')(net[\'conv5_2\'])
- net[\'pool5\'] = MaxPooling2D((3, 3), strides=(1, 1), border_mode=\'same\',
- name=\'pool5\')(net[\'conv5_3\'])
- # FC6
- net[\'fc6\'] = AtrousConvolution2D(1024, 3, 3, atrous_rate=(6, 6),
- activation=\'relu\', border_mode=\'same\',
- name=\'fc6\')(net[\'pool5\'])
- # FC7
- net[\'fc7\'] = Convolution2D(1024, 1, 1, activation=\'relu\',
- border_mode=\'same\', name=\'fc7\')(net[\'fc6\'])
- # Block 6
- net[\'conv6_1\'] = Convolution2D(256, 1, 1, activation=\'relu\',
- border_mode=\'same\',
- name=\'conv6_1\')(net[\'fc7\'])
- net[\'conv6_2\'] = Convolution2D(512, 3, 3, subsample=(2, 2),
- activation=\'relu\', border_mode=\'same\',
- name=\'conv6_2\')(net[\'conv6_1\'])
- # Block 7
- net[\'conv7_1\'] = Convolution2D(128, 1, 1, activation=\'relu\',
- border_mode=\'same\',
- name=\'conv7_1\')(net[\'conv6_2\'])
- net[\'conv7_2\'] = ZeroPadding2D()(net[\'conv7_1\'])
- net[\'conv7_2\'] = Convolution2D(256, 3, 3, subsample=(2, 2),
- activation=\'relu\', border_mode=\'valid\',
- name=\'conv7_2\')(net[\'conv7_2\'])
- # Block 8
- net[\'conv8_1\'] = Convolution2D(128, 1, 1, activation=\'relu\',
- border_mode=\'same\',
- name=\'conv8_1\')(net[\'conv7_2\'])
- net[\'conv8_2\'] = Convolution2D(256, 3, 3, subsample=(2, 2),
- activation=\'relu\', border_mode=\'same\',
- name=\'conv8_2\')(net[\'conv8_1\'])
- # Last Pool
- net[\'pool6\'] = GlobalAveragePooling2D(name=\'pool6\')(net[\'conv8_2\'])
标红部分就是进行改变的部分,可以看出把FC6换成了空洞卷积,和普通卷积差不多,就是把一次卷积的感受域扩大了。FC7换成了普通卷积,之后再添加了几个卷积块。
接下来就是通过改变后的VGG16得到的多层feature map来预测location 和 confidence。使用到的feature map 有:conv4_3、fc7、conv6_2、conv7_2、conv8_2、pool6。总共6层的feature map。因为对于每层的处理步骤差不多,所以就贴出conv4_3处理的代码:
- # Prediction from conv4_3
- net[\'conv4_3_norm\'] = Normalize(20, name=\'conv4_3_norm\')(net[\'conv4_3\'])
- num_priors = 3
- x = Convolution2D(num_priors * 4, 3, 3, border_mode=\'same\',
- name=\'conv4_3_norm_mbox_loc\')(net[\'conv4_3_norm\'])
- net[\'conv4_3_norm_mbox_loc\'] = x
- flatten = Flatten(name=\'conv4_3_norm_mbox_loc_flat\')
- net[\'conv4_3_norm_mbox_loc_flat\'] = flatten(net[\'conv4_3_norm_mbox_loc\'])
- name = \'conv4_3_norm_mbox_conf\'
- if num_classes != 21:
- name += \'_{}\'.format(num_classes)
- x = Convolution2D(num_priors * num_classes, 3, 3, border_mode=\'same\',
- name=name)(net[\'conv4_3_norm\'])
- net[\'conv4_3_norm_mbox_conf\'] = x
- flatten = Flatten(name=\'conv4_3_norm_mbox_conf_flat\')
- net[\'conv4_3_norm_mbox_conf_flat\'] = flatten(net[\'conv4_3_norm_mbox_conf\'])
- priorbox = PriorBox(img_size, 30.0, aspect_ratios=[2],
- variances=[0.1, 0.1, 0.2, 0.2],
- name=\'conv4_3_norm_mbox_priorbox\')
- net[\'conv4_3_norm_mbox_priorbox\'] = priorbox(net[\'conv4_3_norm\'])
可以看出对于conv4_3这层的feature map,采用的default box 的个数为3。所以location预测这个卷积层使用的卷积核个数为:3*4=12个。卷积完之后进行flatten,因为最后的输出是多层feature map预测的concatenate。同理,对于confidence预测采用的卷积核个数为:21*3=36(对于voc数据集而言)。对于PriorBox这一层,目前只需要知道它是对feature map 进行相应的操作,来得到default box的,而且对于特定的一层feature map而言,它是固定不变的,不随train或者predict的过程改变的。
对于pool6产生的feature map处理有一些不一样,这里单独的拿出来说一下,因为pool6层使用的是globa laverage pool,所以它输出的大小为1*1*256,比较小,不太适合用卷积处理了,就直接用Dense层来处理了:
- # Prediction from pool6
- num_priors = 6
- x = Dense(num_priors * 4, name=\'pool6_mbox_loc_flat\')(net[\'pool6\'])
- net[\'pool6_mbox_loc_flat\'] = x
- name = \'pool6_mbox_conf_flat\'
- if num_classes != 21:
- name += \'_{}\'.format(num_classes)
- x = Dense(num_priors * num_classes, name=name)(net[\'pool6\'])
- net[\'pool6_mbox_conf_flat\'] = x
- priorbox = PriorBox(img_size, 276.0, max_size=330.0, aspect_ratios=[2, 3],
- variances=[0.1, 0.1, 0.2, 0.2],
- name=\'pool6_mbox_priorbox\')
- if K.image_dim_ordering() == \'tf\':
- target_shape = (1, 1, 256)
- else:
- target_shape = (256, 1, 1)
- net[\'pool6_reshaped\'] = Reshape(target_shape,
- name=\'pool6_reshaped\')(net[\'pool6\'])
- net[\'pool6_mbox_priorbox\'] = priorbox(net[\'pool6_reshaped\'])
每层预测完事之后呢,当然是把他们都给concatenate起来,就贴location的实现,其他两个类似:
- net[\'mbox_loc\'] = merge([net[\'conv4_3_norm_mbox_loc_flat\'],
- net[\'fc7_mbox_loc_flat\'],
- net[\'conv6_2_mbox_loc_flat\'],
- net[\'conv7_2_mbox_loc_flat\'],
- net[\'conv8_2_mbox_loc_flat\'],
- net[\'pool6_mbox_loc_flat\']],
- mode=\'concat\', concat_axis=1, name=\'mbox_loc\')
因为之前进行了flatten,所以concatenate得到的是一个batch中每个sample所有的location位置,并且是一个一维的形式存在,需要把它给重新reshape成[batch, number of default box, 4 ]的形式;预测的class分类也是类似的:[batch, number of default box, 21 ]。最后再将location、class、default box三者进行merge得到最终的预测结果。
- #计算default box 的个数
- if hasattr(net[\'mbox_loc\'], \'_keras_shape\'):
- num_boxes = net[\'mbox_loc\']._keras_shape[-1] // 4
- elif hasattr(net[\'mbox_loc\'], \'int_shape\'):
- num_boxes = K.int_shape(net[\'mbox_loc\'])[-1] // 4
- net[\'mbox_loc\'] = Reshape((num_boxes, 4),
- name=\'mbox_loc_final\')(net[\'mbox_loc\'])
- net[\'mbox_conf\'] = Reshape((num_boxes, num_classes),
- name=\'mbox_conf_logits\')(net[\'mbox_conf\'])
- net[\'mbox_conf\'] = Activation(\'softmax\',
- name=\'mbox_conf_final\')(net[\'mbox_conf\'])
- net[\'predictions\'] = merge([net[\'mbox_loc\'],
- net[\'mbox_conf\'],
- net[\'mbox_priorbox\']],
- mode=\'concat\', concat_axis=2,
- name=\'predictions\')
我们来计算一下这六层feature map总共拥有的default box的数量:38*38*3+19*19*6+10*10*6+5*5*6+3*3*6+1*1*6=7308。和论文中还是存在一定的差别的。
接一下就是介绍一下model中使用到的PriorBox层的作用。它是作用在每一层的feature map上的,根据输入的不同aspect ratio 和 scale 以及 num_prior来返回特定的default box,default box 的数目是feature map的height*width*num_prior。具体看代码:
- class PriorBox(Layer):
- \'\'\'
- img_size: 输入图片的大小(w, h).
- min_size: 每个feature cell中最小的scale,不是归一化后的值,而是实际的大小
- max_size: 每个feature cell中最大的scale,不是归一化的值,而是实际的大小
- aspect_ratios: 长宽比
- flip:是否需要对长宽比进行反转。
- variances: 添加的方差x,y,w,h
- clip: 让输出保持在[0,1之间
- 输入的shape:
- `4D的tensor:(samples, rows, cols, channels)
- 输出的shape:
- 3D的tensor:(samples, num_boxes, 8)
- 其中的8具体为:(xmin, ymin, xmax, ymax, variance[0], variance[1], variance[2], variance[3])
- """
- def __init__(self, img_size, min_size, max_size=None, aspect_ratios=None,
- flip=True, variances=[0.1], clip=True, **kwargs):
- self.waxis = 2
- self.haxis = 1
- self.img_size = img_size
- if min_size <= 0:
- raise Exception(\'min_size must be positive.\')
- self.min_size = min_size
- self.max_size = max_size
- self.aspect_ratios = [1.0]
- if max_size:
- if max_size < min_size:
- raise Exception(\'max_size must be greater than min_size.\')
- self.aspect_ratios.append(1.0)
- #根据给定的aspect_ratio来计算全部的aspect ratio
- if aspect_ratios:
- for ar in aspect_ratios:
- if ar in self.aspect_ratios:
- continue
- self.aspect_ratios.append(ar)
- if flip:
- self.aspect_ratios.append(1.0 / ar)
- self.variances = np.array(variances)
- self.clip = True
- super(PriorBox, self).__init__(**kwargs)
- #用于返回自定义层的输出shape
- def compute_output_shape(self, input_shape):
- num_priors_ = len(self.aspect_ratios)
- layer_width = input_shape[self.waxis]
- layer_height = input_shape[self.haxis]
- num_boxes = num_priors_ * layer_width * layer_height
- return (input_shape[0], num_boxes, 8)
- def call(self, x, mask=None):
- if hasattr(x, \'_keras_shape\'):
- input_shape = x._keras_shape
- elif hasattr(K, \'int_shape\'):
- input_shape = K.int_shape(x)
- layer_width = input_shape[self.waxis]
- layer_height = input_shape[self.haxis]
- img_width = self.img_size[0]
- img_height = self.img_size[1]
- # define prior boxes shapes
- box_widths = []
- box_heights = []
- for ar in self.aspect_ratios:
- if ar == 1 and len(box_widths) == 0:
- box_widths.append(self.min_size)
- box_heights.append(self.min_size)
- elif ar == 1 and len(box_widths) > 0:
- box_widths.append(np.sqrt(self.min_size * self.max_size))
- box_heights.append(np.sqrt(self.min_size * self.max_size))
- elif ar != 1:
- box_widths.append(self.min_size * np.sqrt(ar))
- box_heights.append(self.min_size / np.sqrt(ar))
- box_widths = 0.5 * np.array(box_widths)
- box_heights = 0.5 * np.array(box_heights)
- # define centers of prior boxes
- step_x = img_width / layer_width
- step_y = img_height / layer_height
- #用于产生default box的中心坐标
- linx = np.linspace(0.5 * step_x, img_width - 0.5 * step_x,
- layer_width)
- liny = np.linspace(0.5 * step_y, img_height - 0.5 * step_y,
- layer_height)
- centers_x, centers_y = np.meshgrid(linx, liny)
- centers_x = centers_x.reshape(-1, 1)
- centers_y = centers_y.reshape(-1, 1)
- # define xmin, ymin, xmax, ymax of prior boxes
- num_priors_ = len(self.aspect_ratios)
- #concatenate之后得到了一连串的(centers_x,centers_y)形式的坐标
- prior_boxes = np.concatenate((centers_x, centers_y), axis=1)
- #扩充得到(centers_x, centers_y, centers_x, centers_y)形式的坐标
- prior_boxes = np.tile(prior_boxes, (1, 2 * num_priors_))
- prior_boxes[:, ::4] -= box_widths
- prior_boxes[:, 1::4] -= box_heights
- prior_boxes[:, 2::4] += box_widths
- prior_boxes[:, 3::4] += box_heights
- prior_boxes[:, ::2] /= img_width
- prior_boxes[:, 1::2] /= img_height
- #最终得到各个default box的归一化后的(Xmin,Ymin, Xmax, Ymax)
- #reshape成[num_box, 4]的形式
- prior_boxes = prior_boxes.reshape(-1, 4)
- if self.clip:
- prior_boxes = np.minimum(np.maximum(prior_boxes, 0.0), 1.0)
- # define variances
- num_boxes = len(prior_boxes)
- if len(self.variances) == 1:
- variances = np.ones((num_boxes, 4)) * self.variances[0]
- elif len(self.variances) == 4:
- variances = np.tile(self.variances, (num_boxes, 1))
- else:
- raise Exception(\'Must provide one or four variances.\')
- ##把variance加入到输出之中。
- prior_boxes = np.concatenate((prior_boxes, variances), axis=1)
- prior_boxes_tensor = K.expand_dims(K.variable(prior_boxes), 0)
- if K.backend() == \'tensorflow\':
- pattern = [tf.shape(x)[0], 1, 1]
- prior_boxes_tensor = tf.tile(prior_boxes_tensor, pattern)
- return prior_boxes_tensor
综合上面对model的分析,最后预测输出的shape为:[batch_size, num_box, location+num_class+8]
整体的架构完事之后,就需要准备好数据和loss function了,先看看如何预处理数据吧。
model的数据准备:
代码中编写了一个处理VOC数据集的py文件:
- import numpy as np
- import os
- from xml.etree import ElementTree
-
- class XML_preprocessor(object):
- #输出为:{image_name: [num_image, num_object_per_image, location+num_class]}
- def __init__(self, data_path):
- self.path_prefix = data_path
- self.num_classes = 20
- self.data = dict()
- self._preprocess_XML()
-
- def _preprocess_XML(self):
- filenames = os.listdir(self.path_prefix)
- for filename in filenames:
- tree = ElementTree.parse(self.path_prefix + filename)
- root = tree.getroot()
- bounding_boxes = []
- one_hot_classes = []
- size_tree = root.find(\'size\')
- width = float(size_tree.find(\'width\').text)
- height = float(size_tree.find(\'height\').text)
- for object_tree in root.findall(\'object\'):
- for bounding_box in object_tree.iter(\'bndbox\'):
- xmin = float(bounding_box.find(\'xmin\').text)/width
- ymin = float(bounding_box.find(\'ymin\').text)/height
- xmax = float(bounding_box.find(\'xmax\').text)/width
- ymax = float(bounding_box.find(\'ymax\').text)/height
- bounding_box = [xmin,ymin,xmax,ymax]
- bounding_boxes.append(bounding_box)
- class_name = object_tree.find(\'name\').text
- one_hot_class = self._to_one_hot(class_name)
- one_hot_classes.append(one_hot_class)
- image_name = root.find(\'filename\').text
- bounding_boxes = np.asarray(bounding_boxes)
- one_hot_classes = np.asarray(one_hot_classes)
- image_data = np.hstack((bounding_boxes, one_hot_classes))
- self.data[image_name] = image_data
-
- def _to_one_hot(self,name):
- one_hot_vector = [0] * self.num_classes
- if name == \'aeroplane\':
- one_hot_vector[0] = 1
- elif name == \'bicycle\':
- one_hot_vector[1] = 1
- elif name == \'bird\':
- one_hot_vector[2] = 1
- elif name == \'boat\':
- one_hot_vector[3] = 1
- elif name == \'bottle\':
- one_hot_vector[4] = 1
- elif name == \'bus\':
- one_hot_vector[5] = 1
- elif name == \'car\':
- one_hot_vector[6] = 1
- elif name == \'cat\':
- one_hot_vector[7] = 1
- elif name == \'chair\':
- one_hot_vector[8] = 1
- elif name == \'cow\':
- one_hot_vector[9] = 1
- elif name == \'diningtable\':
- one_hot_vector[10] = 1
- elif name == \'dog\':
- one_hot_vector[11] = 1
- elif name == \'horse\':
- one_hot_vector[12] = 1
- elif name == \'motorbike\':
- one_hot_vector[13] = 1
- elif name == \'person\':
- one_hot_vector[14] = 1
- elif name == \'pottedplant\':
- one_hot_vector[15] = 1
- elif name == \'sheep\':
- one_hot_vector[16] = 1
- elif name == \'sofa\':
- one_hot_vector[17] = 1
- elif name == \'train\':
- one_hot_vector[18] = 1
- elif name == \'tvmonitor\':
- one_hot_vector[19] = 1
- else:
- print(\'unknown label: %s\' %name)
- return one_hot_vector
- ## 写入到pkl文件中。
- import pickle
- data = XML_preprocessor(\'VOC2007/Annotations/\').data
- pickle.dump(data,open(\'VOC2007.p\',\'wb\'))
把标注写入到pkl文件中后,再利用定义一个Generator类来产生x_batch和 y_batch用于训练,直接看重点,类中的generate函数:
- def generate(self, train=True):
- while True:
- if train:
- shuffle(self.train_keys)
- keys = self.train_keys
- else:
- shuffle(self.val_keys)
- keys = self.val_keys
- inputs = []
- targets = []
- for key in keys:
- img_path = self.path_prefix + key
- img = imread(img_path).astype(\'float32\')
- y = self.gt[key].copy()#从pkl文件读取而来的groud truth
- ##y的shape是一张图片中box的数目和位置+类别。(num_box, coordinate+num_class)
- if train and self.do_crop:
- img, y = self.random_sized_crop(img, y)
- img = imresize(img, self.image_size).astype(\'float32\')
- if train:#进行数据扩充
- shuffle(self.color_jitter)
- for jitter in self.color_jitter:
- img = jitter(img)
- if self.lighting_std:
- img = self.lighting(img)
- if self.hflip_prob > 0:
- img, y = self.horizontal_flip(img, y)
- if self.vflip_prob > 0:
- img, y = self.vertical_flip(img, y)
- y = self.bbox_util.assign_boxes(y) #给groud truth 分配 default box
- inputs.append(img)
- targets.append(y)
- if len(targets) == self.batch_size:
- tmp_inp = np.array(inputs)
- tmp_targets = np.array(targets)
- inputs = []
- targets = []
- yield preprocess_input(tmp_inp), tmp_targets#产生一个batch的输入数据,及其标准的输出label。
在给groud truth 分配 default box 时用到了BBoxUtility类中的assign_boxes函数,这个类是写在ssd_utils.py文件中的,其中的assign_boxes函数的代码如下:
- #用于给label分配高分的default box
- def assign_boxes(self, boxes):
- #变量: boxes: Box,它的shape为:(num_boxes, 4 + num_classes),其中num_classes没有包括背景
- #返回值: assignment:它的shape为: (num_boxes, 4 + num_classes + 8),
- #第二维上的8其实很多都是0,只有在assignment[:, -8]存在1,代表给default box分配了哪个groud truth
- assignment = np.zeros((self.num_priors, 4 + self.num_classes + 8))
- assignment[:, 4] = 1.0
- if len(boxes) == 0:
- return assignment
- encoded_boxes = np.apply_along_axis(self.encode_box, 1, boxes[:, :4])
- encoded_boxes = encoded_boxes.reshape(-1, self.num_priors, 5)
- #找出一张图中的所有的object与所有的prior box的最大IOU,即每个prior box对应一个object
- best_iou = encoded_boxes[:, :, -1].max(axis=0)
- ##找出每个prior box对应的那个object的索引。len(best_iou_idx)=num_priors
- best_iou_idx = encoded_boxes[:, :, -1].argmax(axis=0)
- ##找出与groud truth 存在IOU的prior box
- best_iou_mask = best_iou > 0
- best_iou_idx = best_iou_idx[best_iou_mask]
- assign_num = len(best_iou_idx)
- ##筛选出与groud truth 有IOU的prior box
- encoded_boxes = encoded_boxes[:, best_iou_mask, :]
- #确定给assignment分配中的prior box分配 具体哪一个groud truth。best_iou_idx中元素的范围为:range(num_object)。
- assignment[:, :4][best_iou_mask] = encoded_boxes[best_iou_idx, np.arange(assign_num),:4]
- assignment[:, 4][best_iou_mask] = 0
- assignment[:, 5:-8][best_iou_mask] = boxes[best_iou_idx, 4:]
- assignment[:, -8][best_iou_mask] = 1
- return assignment
-
返回了最终的assignment,用于作为训练时候的标准输出。
值得注意的是,在这个类里面用到self.prior,即default box都是作者先写入到了pkl文件中的,方便于使用,而且对于特定大小的feature map而言,default box是保持不变的,所以提前给出是不会影响训练的。
输入的数据和标准的输出都知道了,接下来就是定义loss function 了
model 的 loss function:
model 的loss function定义在了ssd_training.py文件中了,里面定义了一些有用的功能函数,来帮助最终loss计算的,我们就直接看最终计算那个loss的函数:
- def compute_loss(self, y_true, y_pred):
- # 在keras中自定义loss函数,它的两个输入必须为预测的输出和标准的输出
- # 变量:
- # y_pred: 它的shape为: (?, num_boxes, 4 + num_classes + 8). 就是在model框架部分介绍的输出。
- # y_truth:它的shape和y_pred的shape是一样的,就是上一节我们介绍assignment那一块的输出,具体参考上一节。
- # 返回最终的所有loss总和
- batch_size = tf.shape(y_true)[0]
- num_boxes = tf.to_float(tf.shape(y_true)[1])
- # 计算出所有default box的loss
- conf_loss = self._softmax_loss(y_true[:, :, 4:-8],
- y_pred[:, :, 4:-8])
- loc_loss = self._l1_smooth_loss(y_true[:, :, :4],
- y_pred[:, :, :4])
- #计算positive 样本的loss
- #num_pos 为一个一维的array:len(num_pos)=batch
- num_pos = tf.reduce_sum(y_true[:, :, -8], axis=-1)
- ##只需计算存在gt_box与其对应的loss
- pos_loc_loss = tf.reduce_sum(loc_loss * y_true[:, :, -8],
- axis=1)
- pos_conf_loss = tf.reduce_sum(conf_loss * y_true[:, :, -8],
- axis=1)
- #计算negative sample的loss,只计算了confidence loss
- num_neg = tf.minimum(self.neg_pos_ratio * num_pos,
- num_boxes - num_pos)
- pos_num_neg_mask = tf.greater(num_neg, 0)
- has_min = tf.to_float(tf.reduce_any(pos_num_neg_mask))
- num_neg = tf.concat(axis=0, values=[num_neg,
- [(1 - has_min) * self.negatives_for_hard]])
- #tf.boolen_mask(a,b),例如b=[true, false],a=[[[2,2],[2,3]]],则输出为[2,2]。
- #实际上就是取num_neg为正数的那些元素,然后再在其中取num_neg中的最小的元素作为num_neg_batch。
- num_neg_batch = tf.reduce_min(tf.boolean_mask(num_neg,
- tf.greater(num_neg, 0)))
- num_neg_batch = tf.to_int32(num_neg_batch)
- confs_start = 4 + self.background_label_id + 1
- confs_end = confs_start + self.num_classes - 1
- #max_confs的shape为:(batch, num_prior)
- max_confs = tf.reduce_max(y_pred[:, :, confs_start:confs_end],
- axis=2)
- #返回负样本的top-K个元素,最终返回的indices的shape为(batch, K=num_neg_batch)
- _, indices = tf.nn.top_k(max_confs * (1 - y_true[:, :, -8]),
- k=num_neg_batch)
- #创建一个shape也为(batch,num_neg_batch)的indices
- batch_idx = tf.expand_dims(tf.range(0, batch_size), 1)
- batch_idx = tf.tile(batch_idx, (1, num_neg_batch))
- #乘以num_boxes后得到batch中每一个sample的index的起始值,再加上top_k得到的index就得到了一个一维的full_indices。
- full_indices = (tf.reshape(batch_idx, [-1]) * tf.to_int32(num_boxes) +
- tf.reshape(indices, [-1]))
- #把得到的conf_loss也reshape成一维,然后用full_indices对其进行取值
- neg_conf_loss = tf.gather(tf.reshape(conf_loss, [-1]),
- full_indices)
- #最终把负样本的confidence loss reshape 成(batch, num_neg_batch),再对每个sample上的loss求和。
- neg_conf_loss = tf.reshape(neg_conf_loss,
- [batch_size, num_neg_batch])
- neg_conf_loss = tf.reduce_sum(neg_conf_loss, axis=1)
- #整合所有的loss:positive loss 和 negative loss
- total_loss = pos_conf_loss + neg_conf_loss
- total_loss /= (num_pos + tf.to_float(num_neg_batch))
- num_pos = tf.where(tf.not_equal(num_pos, 0), num_pos,
- tf.ones_like(num_pos))
- total_loss += (self.alpha * pos_loc_loss) / num_pos
- return total_loss
这时候function loss 也准备好了,属于一切都准备就绪了。当然就是进行训练了。其实在写这篇blog之前我还是对loss function 这块没有太细看明白,写完之后顿时就恍然大悟的,写blog确实是一个自我学习的一个很好过程。
model 进行 training
training这一块是写在SSD_training.ipynb的jupyter notebook文件中的,上面那些model 的部件准备好了之后,training就按照keras的流程照搬就好了。
不过需要注意一下,作者给的这个训练并不是voc数据集的训练,而是对3种瓶子的检测。
1.必要的库和自己编写的模块的导入:
- import cv2
- import keras
- from keras.applications.imagenet_utils import preprocess_input
- from keras.backend.tensorflow_backend import set_session
- from keras.models import Model
- from keras.preprocessing import image
- import matplotlib.pyplot as plt
- import numpy as np
- import pickle
- from random import shuffle
- from scipy.misc import imread
- from scipy.misc import imresize
- import tensorflow as tf
- from ssd import SSD300
- from ssd_training import MultiboxLoss
- from ssd_utils import BBoxUtility
-
- %matplotlib inline
- plt.rcParams[\'figure.figsize\'] = (8, 8)
- plt.rcParams[\'image.interpolation\'] = \'nearest\'
-
- np.set_printoptions(suppress=True)
2.必要的初始化参数和prior box 的读取,以及输入数据的读取:
- NUM_CLASSES = 4
- input_shape = (300, 300, 3)
- #prior_boxes_ssd300.pkl 存放了所有的prior:[xmin, ymin, xmax, ymax,var[0],var[1],var[2],var[3]]
- priors = pickle.load(open(\'prior_boxes_ssd300.pkl\', \'rb\'))
- bbox_util = BBoxUtility(NUM_CLASSES, priors)
- #获得输入数据的file_name、bounding box 和 label
- gt = pickle.load(open(\'gt_pascal.pkl\', \'rb\'))
- keys = sorted(gt.keys())
- num_train = int(round(0.8 * len(keys)))
- train_keys = keys[:num_train]
- val_keys = keys[num_train:]
- num_val = len(val_keys)
3.输入数据和label的generator类定义,有点长,就把generate 那个函数贴出来:
- class Generator(object):
- def generate(self, train=True):
- while True:
- if train:
- shuffle(self.train_keys)
- keys = self.train_keys
- else:
- shuffle(self.val_keys)
- keys = self.val_keys
- inputs = []
- targets = []
- for key in keys:
- img_path = self.path_prefix + key
- img = imread(img_path).astype(\'float32\')
- y = self.gt[key].copy()
- ##y的shape是一张图片中box的数目和位置+类别。(num_box, coordinate+num_class)
- if train and self.do_crop:
- img, y = self.random_sized_crop(img, y)
- img = imresize(img, self.image_size).astype(\'float32\')
- if train:
- shuffle(self.color_jitter)
- for jitter in self.color_jitter:
- img = jitter(img)
- if self.lighting_std:
- img = self.lighting(img)
- if self.hflip_prob > 0:
- img, y = self.horizontal_flip(img, y)
- if self.vflip_prob > 0:
- img, y = self.vertical_flip(img, y)
- y = self.bbox_util.assign_boxes(y)
- inputs.append(img)
- targets.append(y)
- if len(targets) == self.batch_size:
- tmp_inp = np.array(inputs)
- tmp_targets = np.array(targets)
- inputs = []
- targets = []
- yield preprocess_input(tmp_inp), tmp_targets #batch 生成器
4.必要的初始化
- #输入数据(图片)的root directory
- path_prefix = \'../../frames/\'
- gen = Generator(gt, bbox_util, 16, \'../../frames/\',
- train_keys, val_keys,
- (input_shape[0], input_shape[1]), do_crop=False)
- #构建SSD300的model
- model = SSD300(input_shape, num_classes=NUM_CLASSES)
- model.load_weights(\'weights_SSD300.hdf5\', by_name=True)
- #也没太弄懂,为什么需要把他们给freeze,为啥也对他们train
- freeze = [\'input_1\', \'conv1_1\', \'conv1_2\', \'pool1\',
- \'conv2_1\', \'conv2_2\', \'pool2\',
- \'conv3_1\', \'conv3_2\', \'conv3_3\', \'pool3\']
- for L in model.layers:
- if L.name in freeze:
- L.trainable = False
5.keras的一些callback function的定义以及model的compile and training:
- def schedule(epoch, decay=0.9):
- return base_lr * decay**(epoch)
-
- callbacks = [keras.callbacks.ModelCheckpoint(\'./checkpoints/weights.{epoch:02d}-{val_loss:.2f}.hdf5\',
- verbose=1,
- save_weights_only=True),
- keras.callbacks.LearningRateScheduler(schedule)]
- base_lr = 3e-4
- optim = keras.optimizers.Adam(lr=base_lr)
- # optim = keras.optimizers.RMSprop(lr=base_lr)
- # optim = keras.optimizers.SGD(lr=base_lr, momentum=0.9, decay=decay, nesterov=True)
- model.compile(optimizer=optim,
- loss=MultiboxLoss(NUM_CLASSES, neg_pos_ratio=2.0).compute_loss)
- nb_epoch = 30
- history = model.fit_generator(gen.generate(True), gen.train_batches,
- nb_epoch, verbose=1,
- callbacks=callbacks,
- validation_data=gen.generate(False),
- nb_val_samples=gen.val_batches,
- nb_worker=1)
6.train完了之后,当然是检测了:
- #数据的读取
- inputs = []
- images = []
- img_path = path_prefix + sorted(val_keys)[0]
- img = image.load_img(img_path, target_size=(300, 300))
- img = image.img_to_array(img)
- images.append(imread(img_path))
- inputs.append(img.copy())
- inputs = preprocess_input(np.array(inputs))
- #进行预测和预测后对预测结果的解码
- preds = model.predict(inputs, batch_size=1, verbose=1)
- results = bbox_util.detection_out(preds)
- #可视化预测结果
- for i, img in enumerate(images):
- # Parse the outputs.
- det_label = results[i][:, 0]
- det_conf = results[i][:, 1]
- det_xmin = results[i][:, 2]
- det_ymin = results[i][:, 3]
- det_xmax = results[i][:, 4]
- det_ymax = results[i][:, 5]
- # Get detections with confidence higher than 0.6.
- top_indices = [i for i, conf in enumerate(det_conf) if conf >= 0.6]
- top_conf = det_conf[top_indices]
- top_label_indices = det_label[top_indices].tolist()
- top_xmin = det_xmin[top_indices]
- top_ymin = det_ymin[top_indices]
- top_xmax = det_xmax[top_indices]
- top_ymax = det_ymax[top_indices]
-
- colors = plt.cm.hsv(np.linspace(0, 1, 4)).tolist()
-
- plt.imshow(img / 255.)
- currentAxis = plt.gca()
-
- for i in range(top_conf.shape[0]):
- xmin = int(round(top_xmin[i] * img.shape[1]))
- ymin = int(round(top_ymin[i] * img.shape[0]))
- xmax = int(round(top_xmax[i] * img.shape[1]))
- ymax = int(round(top_ymax[i] * img.shape[0]))
- score = top_conf[i]
- label = int(top_label_indices[i])
- #注意这里的label直接使用的数字,因为它train的数据集不是voc,而是几种瓶子的种类。
- display_txt = \'{:0.2f}, {}\'.format(score, label)
- coords = (xmin, ymin), xmax-xmin+1, ymax-ymin+1
- color = colors[label]
- currentAxis.add_patch(plt.Rectangle(*coords, fill=False, edgecolor=color, linewidth=2))
- currentAxis.text(xmin, ymin, display_txt, bbox={\'facecolor\':color, \'alpha\':0.5})
- plt.show()
7.predict 的结果:
整个过程也就基本上的结束了。SSD的keras实现还是比较简单的,没有mask r-cnn那么费劲。不知道为啥我先看的yolo的原理和实现,但是不太想写yolo的实现和原理(手动白眼),直接跳到了SSD,大概是觉得SSD比较好理解把,yolo等有时间再写吧。
之后我再把生成prior box pkl文件的代码贴上来,自己写的代码有点乱。希望看到了最后你对SDD的模型架构和具体实现都有了一个很好的认识。因为也是一个新手,所以其中有什么理解不到位,或者写错的,欢迎指出。
添加:prior box 的 pkl文件生成代码:其实也很简单,就是稍微修改了一下PriorBox这个自定义的keras layer,把输出用来产生对于特定feature map 大小的 default box:
- import numpy as np
- class PriorBox():
- def __init__(self, img_size, min_size, max_size=None, aspect_ratios=None,
- flip=True, variances=[0.1,0.1,0.2,0.2], clip=True, layer_shape=[8,8],**kwargs):
- self.input_shape = layer_shape
- self.img_size = img_size
- if min_size <= 0:
- raise Exception(\'min_size must be positive.\')
- self.min_size = min_size
- self.max_size = max_size
- self.aspect_ratios = [1.0]
- if max_size:
- if max_size < min_size:
- raise Exception(\'max_size must be greater than min_size.\')
- self.aspect_ratios.append(1.0)
- if aspect_ratios:
- for ar in aspect_ratios:
- if ar in self.aspect_ratios:
- continue
- self.aspect_ratios.append(ar)
- if flip:
- self.aspect_ratios.append(1.0 / ar)
- self.variances = np.array(variances)
- self.clip = True
- super(PriorBox, self).__init__(**kwargs)
-
- def compute_default_box(self):
- layer_height = self.input_shape[0]
- layer_width = self.input_shape[1]
- img_width = self.img_size[0]
- img_height = self.img_size[1]
- # define prior boxes shapes
- box_widths = []
- box_heights = []
- for ar in self.aspect_ratios:
- if ar == 1 and len(box_widths) == 0:
- box_widths.append(self.min_size)
- box_heights.append(self.min_size)
- elif ar == 1 and len(box_widths) > 0:
- box_widths.append(np.sqrt(self.min_size * self.max_size))
- box_heights.append(np.sqrt(self.min_size * self.max_size))
- elif ar != 1:
- box_widths.append(self.min_size * np.sqrt(ar))
- box_heights.append(self.min_size / np.sqrt(ar))
- box_widths = 0.5 * np.array(box_widths)
- box_heights = 0.5 * np.array(box_heights)
- # define centers of prior boxes
- step_x = img_width / layer_width
- step_y = img_height / layer_height
- #generate a list data
- linx = np.linspace(0.5 * step_x, img_width - 0.5 * step_x,
- layer_width)
- liny = np.linspace(0.5 * step_y, img_height - 0.5 * step_y,
- layer_height)
- ##ulitize meshgrid function to generate default box\'s coordinates
- centers_x, centers_y = np.meshgrid(linx, liny)
- centers_x = centers_x.reshape(-1, 1)
- centers_y = centers_y.reshape(-1, 1)
- # define xmin, ymin, xmax, ymax of prior boxes
- num_priors_ = len(self.aspect_ratios)
- prior_boxes = np.concatenate((centers_x, centers_y), axis=1)
- prior_boxes = np.tile(prior_boxes, (1, 2 * num_priors_))
- prior_boxes[:, ::4] -= box_widths
- prior_boxes[:, 1::4] -= box_heights
- prior_boxes[:, 2::4] += box_widths
- prior_boxes[:, 3::4] += box_heights
- prior_boxes[:, ::2] /= img_width
- prior_boxes[:, 1::2] /= img_height
- prior_boxes = prior_boxes.reshape(-1, 4)
- if self.clip:
- prior_boxes = np.minimum(np.maximum(prior_boxes, 0.0), 1.0)
- # define variances
- num_boxes = len(prior_boxes)
- if len(self.variances) == 1:
- variances = np.ones((num_boxes, 4)) * self.variances[0]
- elif len(self.variances) == 4:
- variances = np.tile(self.variances, (num_boxes, 1))
- else:
- raise Exception(\'Must provide one or four variances.\')
- prior_boxes = np.concatenate((prior_boxes, variances), axis=1)
- return prior_boxes
-
- #调用修改后的PriorBox类
- img_size = (300, 300)
- default_box_layer1 = PriorBox(img_size, 30, [], aspect_ratios=[2], layer_shape=(38,38)).compute_default_box()
- default_box_layer2 = PriorBox(img_size, 60, 114, aspect_ratios=[2,3], layer_shape=(19,19)).compute_default_box()
- default_box_layer3 = PriorBox(img_size, 114, 168, aspect_ratios=[2,3], layer_shape=(10,10)).compute_default_box()
- default_box_layer4 = PriorBox(img_size, 168, 222, aspect_ratios=[2,3], layer_shape=(5,5)).compute_default_box()
- default_box_layer5 = PriorBox(img_size, 222, 276, aspect_ratios=[2,3], layer_shape=(3,3)).compute_default_box()
- default_box_layer6 = PriorBox(img_size, 276, 330, aspect_ratios=[2,3], layer_shape=(1,1)).compute_default_box()
- #把各层的输出concatenate起来
- default_box = np.concatenate((default_box_layer1, default_box_layer2, default_box_layer3,\
- default_box_layer4, default_box_layer5, default_box_layer6), axis=0)
- #写入到pkl文件中
- import pickle
- pickle.dump(default_box,open("default_box_information","wb"))