SSD算法详解 及其 keras实现

时间:2024-03-03 21:57:33

在上一篇的博客讲述了SSD的原理,这一篇主要是讲解keras的实现。

keras代码的github地址为:点击打开链接    

    model 的框架实现(ssd.py):

   先给出了改变后的VGG16的实现:

  1. def SSD300(input_shape, num_classes=21):
  2. #Input_shape 为输入的形状(300,300,3)
  3. #num_class 为需要检测的种类。
  4.  # Block 1
  5. input_tensor = input_tensor = Input(shape=input_shape)
  6. img_size = (input_shape[1], input_shape[0])
  7. net[\'input\'] = input_tensor
  8. net[\'conv1_1\'] = Convolution2D(64, 3, 3,
  9. activation=\'relu\',
  10. border_mode=\'same\',
  11. name=\'conv1_1\')(net[\'input\'])
  12. net[\'conv1_2\'] = Convolution2D(64, 3, 3,
  13. activation=\'relu\',
  14. border_mode=\'same\',
  15. name=\'conv1_2\')(net[\'conv1_1\'])
  16. net[\'pool1\'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode=\'same\',
  17. name=\'pool1\')(net[\'conv1_2\'])
  18. # Block 2
  19. net[\'conv2_1\'] = Convolution2D(128, 3, 3,
  20. activation=\'relu\',
  21. border_mode=\'same\',
  22. name=\'conv2_1\')(net[\'pool1\'])
  23. net[\'conv2_2\'] = Convolution2D(128, 3, 3,
  24. activation=\'relu\',
  25. border_mode=\'same\',
  26. name=\'conv2_2\')(net[\'conv2_1\'])
  27. net[\'pool2\'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode=\'same\',
  28. name=\'pool2\')(net[\'conv2_2\'])
  29. # Block 3
  30. net[\'conv3_1\'] = Convolution2D(256, 3, 3,
  31. activation=\'relu\',
  32. border_mode=\'same\',
  33. name=\'conv3_1\')(net[\'pool2\'])
  34. net[\'conv3_2\'] = Convolution2D(256, 3, 3,
  35. activation=\'relu\',
  36. border_mode=\'same\',
  37. name=\'conv3_2\')(net[\'conv3_1\'])
  38. net[\'conv3_3\'] = Convolution2D(256, 3, 3,
  39. activation=\'relu\',
  40. border_mode=\'same\',
  41. name=\'conv3_3\')(net[\'conv3_2\'])
  42. net[\'pool3\'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode=\'same\',
  43. name=\'pool3\')(net[\'conv3_3\'])
  44. # Block 4
  45. net[\'conv4_1\'] = Convolution2D(512, 3, 3,
  46. activation=\'relu\',
  47. border_mode=\'same\',
  48. name=\'conv4_1\')(net[\'pool3\'])
  49. net[\'conv4_2\'] = Convolution2D(512, 3, 3,
  50. activation=\'relu\',
  51. border_mode=\'same\',
  52. name=\'conv4_2\')(net[\'conv4_1\'])
  53. net[\'conv4_3\'] = Convolution2D(512, 3, 3,
  54. activation=\'relu\',
  55. border_mode=\'same\',
  56. name=\'conv4_3\')(net[\'conv4_2\'])
  57. net[\'pool4\'] = MaxPooling2D((2, 2), strides=(2, 2), border_mode=\'same\',
  58. name=\'pool4\')(net[\'conv4_3\'])
  59. # Block 5
  60. net[\'conv5_1\'] = Convolution2D(512, 3, 3,
  61. activation=\'relu\',
  62. border_mode=\'same\',
  63. name=\'conv5_1\')(net[\'pool4\'])
  64. net[\'conv5_2\'] = Convolution2D(512, 3, 3,
  65. activation=\'relu\',
  66. border_mode=\'same\',
  67. name=\'conv5_2\')(net[\'conv5_1\'])
  68. net[\'conv5_3\'] = Convolution2D(512, 3, 3,
  69. activation=\'relu\',
  70. border_mode=\'same\',
  71. name=\'conv5_3\')(net[\'conv5_2\'])
  72. net[\'pool5\'] = MaxPooling2D((3, 3), strides=(1, 1), border_mode=\'same\',
  73. name=\'pool5\')(net[\'conv5_3\'])
  74. # FC6
  75. net[\'fc6\'] = AtrousConvolution2D(1024, 3, 3, atrous_rate=(6, 6),
  76. activation=\'relu\', border_mode=\'same\',
  77. name=\'fc6\')(net[\'pool5\'])
  78. # FC7
  79. net[\'fc7\'] = Convolution2D(1024, 1, 1, activation=\'relu\',
  80. border_mode=\'same\', name=\'fc7\')(net[\'fc6\'])
  81. # Block 6
  82. net[\'conv6_1\'] = Convolution2D(256, 1, 1, activation=\'relu\',
  83. border_mode=\'same\',
  84. name=\'conv6_1\')(net[\'fc7\'])
  85. net[\'conv6_2\'] = Convolution2D(512, 3, 3, subsample=(2, 2),
  86. activation=\'relu\', border_mode=\'same\',
  87. name=\'conv6_2\')(net[\'conv6_1\'])
  88. # Block 7
  89. net[\'conv7_1\'] = Convolution2D(128, 1, 1, activation=\'relu\',
  90. border_mode=\'same\',
  91. name=\'conv7_1\')(net[\'conv6_2\'])
  92. net[\'conv7_2\'] = ZeroPadding2D()(net[\'conv7_1\'])
  93. net[\'conv7_2\'] = Convolution2D(256, 3, 3, subsample=(2, 2),
  94. activation=\'relu\', border_mode=\'valid\',
  95. name=\'conv7_2\')(net[\'conv7_2\'])
  96. # Block 8
  97. net[\'conv8_1\'] = Convolution2D(128, 1, 1, activation=\'relu\',
  98. border_mode=\'same\',
  99. name=\'conv8_1\')(net[\'conv7_2\'])
  100. net[\'conv8_2\'] = Convolution2D(256, 3, 3, subsample=(2, 2),
  101. activation=\'relu\', border_mode=\'same\',
  102. name=\'conv8_2\')(net[\'conv8_1\'])
  103. # Last Pool
  104. net[\'pool6\'] = GlobalAveragePooling2D(name=\'pool6\')(net[\'conv8_2\'])

    标红部分就是进行改变的部分,可以看出把FC6换成了空洞卷积,和普通卷积差不多,就是把一次卷积的感受域扩大了。FC7换成了普通卷积,之后再添加了几个卷积块。


接下来就是通过改变后的VGG16得到的多层feature map来预测location 和 confidence。使用到的feature map 有:conv4_3、fc7、conv6_2、conv7_2、conv8_2、pool6。总共6层的feature map。因为对于每层的处理步骤差不多,所以就贴出conv4_3处理的代码:

  1. # Prediction from conv4_3
  2.     net[\'conv4_3_norm\'] = Normalize(20, name=\'conv4_3_norm\')(net[\'conv4_3\'])
  3.     num_priors = 3
  4.     x = Convolution2D(num_priors * 4, 3, 3, border_mode=\'same\',
  5.                       name=\'conv4_3_norm_mbox_loc\')(net[\'conv4_3_norm\'])
  6.     net[\'conv4_3_norm_mbox_loc\'] = x
  7.     flatten = Flatten(name=\'conv4_3_norm_mbox_loc_flat\')
  8.     net[\'conv4_3_norm_mbox_loc_flat\'] = flatten(net[\'conv4_3_norm_mbox_loc\'])
  9.     name = \'conv4_3_norm_mbox_conf\'
  10.     if num_classes != 21:
  11.         name += \'_{}\'.format(num_classes)
  12.     x = Convolution2D(num_priors * num_classes, 3, 3, border_mode=\'same\',
  13.                       name=name)(net[\'conv4_3_norm\'])
  14.     net[\'conv4_3_norm_mbox_conf\'] = x
  15.     flatten = Flatten(name=\'conv4_3_norm_mbox_conf_flat\')
  16.     net[\'conv4_3_norm_mbox_conf_flat\'] = flatten(net[\'conv4_3_norm_mbox_conf\'])
  17.     priorbox = PriorBox(img_size, 30.0, aspect_ratios=[2],
  18.                         variances=[0.1, 0.1, 0.2, 0.2],
  19.                         name=\'conv4_3_norm_mbox_priorbox\')
  20.     net[\'conv4_3_norm_mbox_priorbox\'] = priorbox(net[\'conv4_3_norm\'])

可以看出对于conv4_3这层的feature map,采用的default box 的个数为3。所以location预测这个卷积层使用的卷积核个数为:3*4=12个。卷积完之后进行flatten,因为最后的输出是多层feature map预测的concatenate。同理,对于confidence预测采用的卷积核个数为:21*3=63(对于voc数据集而言)。对于PriorBox这一层,目前只需要知道它是对feature map 进行相应的操作,来得到default box的,而且对于特定的一层feature map而言,它是固定不变的,不随train或者predict的过程改变的。

对于pool6产生的feature map处理有一些不一样,这里单独拿出来说一下。因为pool6层使用的是global average pool,所以它输出的大小为1*1*256,比较小,不太适合用卷积处理了,就直接用Dense层来处理了:

  1. # Prediction from pool6
  2.     num_priors = 6
  3.     x = Dense(num_priors * 4, name=\'pool6_mbox_loc_flat\')(net[\'pool6\'])
  4.     net[\'pool6_mbox_loc_flat\'] = x
  5.     name = \'pool6_mbox_conf_flat\'
  6.     if num_classes != 21:
  7.         name += \'_{}\'.format(num_classes)
  8.     x = Dense(num_priors * num_classes, name=name)(net[\'pool6\'])
  9.     net[\'pool6_mbox_conf_flat\'] = x
  10.     priorbox = PriorBox(img_size, 276.0, max_size=330.0, aspect_ratios=[2, 3],
  11.                         variances=[0.1, 0.1, 0.2, 0.2],
  12.                         name=\'pool6_mbox_priorbox\')
  13.     if K.image_dim_ordering() == \'tf\':
  14.         target_shape = (1, 1, 256)
  15.     else:
  16.         target_shape = (256, 1, 1)
  17.     net[\'pool6_reshaped\'] = Reshape(target_shape,
  18.                                     name=\'pool6_reshaped\')(net[\'pool6\'])
  19.     net[\'pool6_mbox_priorbox\'] = priorbox(net[\'pool6_reshaped\'])


每层预测完事之后呢,当然是把他们都给concatenate起来,就贴location的实现,其他两个类似:

  1. net[\'mbox_loc\'] = merge([net[\'conv4_3_norm_mbox_loc_flat\'],
  2. net[\'fc7_mbox_loc_flat\'],
  3. net[\'conv6_2_mbox_loc_flat\'],
  4. net[\'conv7_2_mbox_loc_flat\'],
  5. net[\'conv8_2_mbox_loc_flat\'],
  6. net[\'pool6_mbox_loc_flat\']],
  7. mode=\'concat\', concat_axis=1, name=\'mbox_loc\')

因为之前进行了flatten,所以concatenate得到的是一个batch中每个sample所有的location位置,并且是一个一维的形式存在,需要把它给重新reshape成[batch, number of default box, 4 ]的形式;预测的class分类也是类似的:[batch, number of default box, 21 ]。最后再将location、class、default box三者进行merge得到最终的预测结果。

  1.     #计算default box 的个数
  2.    if hasattr(net[\'mbox_loc\'], \'_keras_shape\'):
  3. num_boxes = net[\'mbox_loc\']._keras_shape[-1] // 4
  4. elif hasattr(net[\'mbox_loc\'], \'int_shape\'):
  5. num_boxes = K.int_shape(net[\'mbox_loc\'])[-1] // 4
  6. net[\'mbox_loc\'] = Reshape((num_boxes, 4),
  7. name=\'mbox_loc_final\')(net[\'mbox_loc\'])
  8. net[\'mbox_conf\'] = Reshape((num_boxes, num_classes),
  9. name=\'mbox_conf_logits\')(net[\'mbox_conf\'])
  10. net[\'mbox_conf\'] = Activation(\'softmax\',
  11. name=\'mbox_conf_final\')(net[\'mbox_conf\'])
  12. net[\'predictions\'] = merge([net[\'mbox_loc\'],
  13. net[\'mbox_conf\'],
  14. net[\'mbox_priorbox\']],
  15. mode=\'concat\', concat_axis=2,
  16. name=\'predictions\')

我们来计算一下这六层feature map总共拥有的default box的数量:38*38*3+19*19*6+10*10*6+5*5*6+3*3*6+1*1*6=7308。和论文中还是存在一定的差别的。

接一下就是介绍一下model中使用到的PriorBox层的作用。它是作用在每一层的feature map上的,根据输入的不同aspect ratio 和 scale 以及 num_prior来返回特定的default box,default box 的数目是feature map的height*width*num_prior。具体看代码:

  1. class PriorBox(Layer):
  2.        \'\'\'
  3.         img_size: 输入图片的大小(w, h).
  4. min_size: 每个feature cell中最小的scale,不是归一化后的值,而是实际的大小
  5. max_size: 每个feature cell中最大的scale,不是归一化的值,而是实际的大小
  6. aspect_ratios: 长宽比
  7. flip:是否需要对长宽比进行反转。
  8. variances: 添加的方差x,y,w,h
  9. clip: 让输出保持在[0,1之间
  10. 输入的shape:
  11. `4D的tensor:(samples, rows, cols, channels)
  12. 输出的shape:
  13. 3D的tensor:(samples, num_boxes, 8)
  14.         其中的8具体为:(xmin, ymin, xmax, ymax, variance[0], variance[1], variance[2], variance[3])
  15. """
  16. def __init__(self, img_size, min_size, max_size=None, aspect_ratios=None,
  17. flip=True, variances=[0.1], clip=True, **kwargs):
  18.   self.waxis = 2
  19. self.haxis = 1
  20. self.img_size = img_size
  21. if min_size <= 0:
  22. raise Exception(\'min_size must be positive.\')
  23. self.min_size = min_size
  24. self.max_size = max_size
  25. self.aspect_ratios = [1.0]
  26. if max_size:
  27. if max_size < min_size:
  28. raise Exception(\'max_size must be greater than min_size.\')
  29. self.aspect_ratios.append(1.0)
  30.         #根据给定的aspect_ratio来计算全部的aspect ratio
  31.         if aspect_ratios:
  32. for ar in aspect_ratios:
  33. if ar in self.aspect_ratios:
  34. continue
  35. self.aspect_ratios.append(ar)
  36. if flip:
  37. self.aspect_ratios.append(1.0 / ar)
  38. self.variances = np.array(variances)
  39. self.clip = True
  40. super(PriorBox, self).__init__(**kwargs)
  41.     #用于返回自定义层的输出shape
  42. def compute_output_shape(self, input_shape):
  43. num_priors_ = len(self.aspect_ratios)
  44. layer_width = input_shape[self.waxis]
  45. layer_height = input_shape[self.haxis]
  46. num_boxes = num_priors_ * layer_width * layer_height
  47. return (input_shape[0], num_boxes, 8)
  48. def call(self, x, mask=None):
  49. if hasattr(x, \'_keras_shape\'):
  50. input_shape = x._keras_shape
  51. elif hasattr(K, \'int_shape\'):
  52. input_shape = K.int_shape(x)
  53. layer_width = input_shape[self.waxis]
  54. layer_height = input_shape[self.haxis]
  55. img_width = self.img_size[0]
  56. img_height = self.img_size[1]
  57. # define prior boxes shapes
  58. box_widths = []
  59. box_heights = []
  60. for ar in self.aspect_ratios:
  61. if ar == 1 and len(box_widths) == 0:
  62. box_widths.append(self.min_size)
  63. box_heights.append(self.min_size)
  64. elif ar == 1 and len(box_widths) > 0:
  65. box_widths.append(np.sqrt(self.min_size * self.max_size))
  66. box_heights.append(np.sqrt(self.min_size * self.max_size))
  67. elif ar != 1:
  68. box_widths.append(self.min_size * np.sqrt(ar))
  69. box_heights.append(self.min_size / np.sqrt(ar))
  70. box_widths = 0.5 * np.array(box_widths)
  71. box_heights = 0.5 * np.array(box_heights)
  72. # define centers of prior boxes
  73. step_x = img_width / layer_width
  74. step_y = img_height / layer_height
  75. #用于产生default box的中心坐标
  76. linx = np.linspace(0.5 * step_x, img_width - 0.5 * step_x,
  77. layer_width)
  78. liny = np.linspace(0.5 * step_y, img_height - 0.5 * step_y,
  79. layer_height)
  80. centers_x, centers_y = np.meshgrid(linx, liny)
  81. centers_x = centers_x.reshape(-1, 1)
  82. centers_y = centers_y.reshape(-1, 1)
  83. # define xmin, ymin, xmax, ymax of prior boxes
  84. num_priors_ = len(self.aspect_ratios)
  85.         #concatenate之后得到了一连串的(centers_x,centers_y)形式的坐标
  86.         prior_boxes = np.concatenate((centers_x, centers_y), axis=1)
  87.         #扩充得到(centers_x, centers_y, centers_x, centers_y)形式的坐标
  88.         prior_boxes = np.tile(prior_boxes, (1, 2 * num_priors_))
  89. prior_boxes[:, ::4] -= box_widths
  90. prior_boxes[:, 1::4] -= box_heights
  91. prior_boxes[:, 2::4] += box_widths
  92. prior_boxes[:, 3::4] += box_heights
  93. prior_boxes[:, ::2] /= img_width
  94. prior_boxes[:, 1::2] /= img_height
  95.         #最终得到各个default box的归一化后的(Xmin,Ymin, Xmax, Ymax)
  96.         #reshape成[num_box, 4]的形式
  97.         prior_boxes = prior_boxes.reshape(-1, 4)
  98. if self.clip:
  99. prior_boxes = np.minimum(np.maximum(prior_boxes, 0.0), 1.0)
  100. # define variances
  101. num_boxes = len(prior_boxes)
  102. if len(self.variances) == 1:
  103. variances = np.ones((num_boxes, 4)) * self.variances[0]
  104. elif len(self.variances) == 4:
  105. variances = np.tile(self.variances, (num_boxes, 1))
  106. else:
  107. raise Exception(\'Must provide one or four variances.\')
  108.         ##把variance加入到输出之中。
  109.         prior_boxes = np.concatenate((prior_boxes, variances), axis=1)
  110. prior_boxes_tensor = K.expand_dims(K.variable(prior_boxes), 0)
  111. if K.backend() == \'tensorflow\':
  112. pattern = [tf.shape(x)[0], 1, 1]
  113. prior_boxes_tensor = tf.tile(prior_boxes_tensor, pattern)
  114. return prior_boxes_tensor

    综合上面对model的分析,最后预测输出的shape为:[batch_size,  num_box, location+num_class+8]

    整体的架构完事之后,就需要准备好数据和loss function了,先看看如何预处理数据吧。

    model的数据准备:

     代码中编写了一个处理VOC数据集的py文件:

  1. import numpy as np
  2. import os
  3. from xml.etree import ElementTree
  4. class XML_preprocessor(object):
  5. #输出为:{image_name: [num_image, num_object_per_image, location+num_class]}
  6. def __init__(self, data_path):
  7. self.path_prefix = data_path
  8. self.num_classes = 20
  9. self.data = dict()
  10. self._preprocess_XML()
  11. def _preprocess_XML(self):
  12. filenames = os.listdir(self.path_prefix)
  13. for filename in filenames:
  14. tree = ElementTree.parse(self.path_prefix + filename)
  15. root = tree.getroot()
  16. bounding_boxes = []
  17. one_hot_classes = []
  18. size_tree = root.find(\'size\')
  19. width = float(size_tree.find(\'width\').text)
  20. height = float(size_tree.find(\'height\').text)
  21. for object_tree in root.findall(\'object\'):
  22. for bounding_box in object_tree.iter(\'bndbox\'):
  23. xmin = float(bounding_box.find(\'xmin\').text)/width
  24. ymin = float(bounding_box.find(\'ymin\').text)/height
  25. xmax = float(bounding_box.find(\'xmax\').text)/width
  26. ymax = float(bounding_box.find(\'ymax\').text)/height
  27. bounding_box = [xmin,ymin,xmax,ymax]
  28. bounding_boxes.append(bounding_box)
  29. class_name = object_tree.find(\'name\').text
  30. one_hot_class = self._to_one_hot(class_name)
  31. one_hot_classes.append(one_hot_class)
  32. image_name = root.find(\'filename\').text
  33. bounding_boxes = np.asarray(bounding_boxes)
  34. one_hot_classes = np.asarray(one_hot_classes)
  35. image_data = np.hstack((bounding_boxes, one_hot_classes))
  36. self.data[image_name] = image_data
  37. def _to_one_hot(self,name):
  38. one_hot_vector = [0] * self.num_classes
  39. if name == \'aeroplane\':
  40. one_hot_vector[0] = 1
  41. elif name == \'bicycle\':
  42. one_hot_vector[1] = 1
  43. elif name == \'bird\':
  44. one_hot_vector[2] = 1
  45. elif name == \'boat\':
  46. one_hot_vector[3] = 1
  47. elif name == \'bottle\':
  48. one_hot_vector[4] = 1
  49. elif name == \'bus\':
  50. one_hot_vector[5] = 1
  51. elif name == \'car\':
  52. one_hot_vector[6] = 1
  53. elif name == \'cat\':
  54. one_hot_vector[7] = 1
  55. elif name == \'chair\':
  56. one_hot_vector[8] = 1
  57. elif name == \'cow\':
  58. one_hot_vector[9] = 1
  59. elif name == \'diningtable\':
  60. one_hot_vector[10] = 1
  61. elif name == \'dog\':
  62. one_hot_vector[11] = 1
  63. elif name == \'horse\':
  64. one_hot_vector[12] = 1
  65. elif name == \'motorbike\':
  66. one_hot_vector[13] = 1
  67. elif name == \'person\':
  68. one_hot_vector[14] = 1
  69. elif name == \'pottedplant\':
  70. one_hot_vector[15] = 1
  71. elif name == \'sheep\':
  72. one_hot_vector[16] = 1
  73. elif name == \'sofa\':
  74. one_hot_vector[17] = 1
  75. elif name == \'train\':
  76. one_hot_vector[18] = 1
  77. elif name == \'tvmonitor\':
  78. one_hot_vector[19] = 1
  79. else:
  80. print(\'unknown label: %s\' %name)
  81. return one_hot_vector
  82. ## 写入到pkl文件中。
  83. import pickle
  84. data = XML_preprocessor(\'VOC2007/Annotations/\').data
  85.  pickle.dump(data,open(\'VOC2007.p\',\'wb\'))

    把标注写入到pkl文件中后,再定义一个Generator类来产生x_batch和 y_batch用于训练,直接看重点,类中的generate函数:

  1. def generate(self, train=True):
  2. while True:
  3. if train:
  4. shuffle(self.train_keys)
  5. keys = self.train_keys
  6. else:
  7. shuffle(self.val_keys)
  8. keys = self.val_keys
  9. inputs = []
  10. targets = []
  11. for key in keys:
  12. img_path = self.path_prefix + key
  13. img = imread(img_path).astype(\'float32\')
  14. y = self.gt[key].copy()#从pkl文件读取而来的groud truth
  15. ##y的shape是一张图片中box的数目和位置+类别。(num_box, coordinate+num_class)
  16. if train and self.do_crop:
  17. img, y = self.random_sized_crop(img, y)
  18. img = imresize(img, self.image_size).astype(\'float32\')
  19. if train:#进行数据扩充
  20. shuffle(self.color_jitter)
  21. for jitter in self.color_jitter:
  22. img = jitter(img)
  23. if self.lighting_std:
  24. img = self.lighting(img)
  25. if self.hflip_prob > 0:
  26. img, y = self.horizontal_flip(img, y)
  27. if self.vflip_prob > 0:
  28. img, y = self.vertical_flip(img, y)
  29.  y = self.bbox_util.assign_boxes(y) #给groud truth 分配 default box
  30. inputs.append(img)
  31. targets.append(y)
  32. if len(targets) == self.batch_size:
  33. tmp_inp = np.array(inputs)
  34. tmp_targets = np.array(targets)
  35. inputs = []
  36. targets = []
  37. yield preprocess_input(tmp_inp), tmp_targets#产生一个batch的输入数据,及其标准的输出label。

在给ground truth 分配 default box 时用到了BBoxUtility类中的assign_boxes函数,这个类是写在ssd_utils.py文件中的,其中的assign_boxes函数的代码如下:

  1. #用于给label分配高分的default box
  2. def assign_boxes(self, boxes):
  3. #变量: boxes: Box,它的shape为:(num_boxes, 4 + num_classes),其中num_classes没有包括背景
  4. #返回值: assignment:它的shape为: (num_boxes, 4 + num_classes + 8),
  5. #第二维上的8其实很多都是0,只有在assignment[:, -8]存在1,代表给default box分配了哪个groud truth
  6.         assignment = np.zeros((self.num_priors, 4 + self.num_classes + 8))
  7.         assignment[:, 4] = 1.0
  8.         if len(boxes) == 0:
  9.             return assignment
  10.         encoded_boxes = np.apply_along_axis(self.encode_box, 1, boxes[:, :4])
  11.         encoded_boxes = encoded_boxes.reshape(-1, self.num_priors, 5)
  12.         #找出一张图中的所有的object与所有的prior box的最大IOU,即每个prior box对应一个object
  13.         best_iou = encoded_boxes[:, :, -1].max(axis=0)
  14.         ##找出每个prior box对应的那个object的索引。len(best_iou_idx)=num_priors
  15.         best_iou_idx = encoded_boxes[:, :, -1].argmax(axis=0)
  16.         ##找出与groud truth 存在IOU的prior box
  17.         best_iou_mask = best_iou > 0
  18.         best_iou_idx = best_iou_idx[best_iou_mask]
  19.         assign_num = len(best_iou_idx)
  20.         ##筛选出与groud truth 有IOU的prior box
  21.         encoded_boxes = encoded_boxes[:, best_iou_mask, :]
  22.         #确定给assignment分配中的prior box分配 具体哪一个groud truth。best_iou_idx中元素的范围为:range(num_object)。
  23.         assignment[:, :4][best_iou_mask] = encoded_boxes[best_iou_idx, np.arange(assign_num),:4]
  24.         assignment[:, 4][best_iou_mask] = 0
  25.         assignment[:, 5:-8][best_iou_mask] = boxes[best_iou_idx, 4:]
  26.         assignment[:, -8][best_iou_mask] = 1
  27.         return assignment

返回了最终的assignment,用于作为训练时候的标准输出。

值得注意的是,在这个类里面用到self.prior,即default box都是作者先写入到了pkl文件中的,方便于使用,而且对于特定大小的feature map而言,default box是保持不变的,所以提前给出是不会影响训练的。

输入的数据和标准的输出都知道了,接下来就是定义loss function 了

model 的 loss function:

model 的loss function定义在了ssd_training.py文件中了,里面定义了一些有用的功能函数,来帮助最终loss计算的,我们就直接看最终计算那个loss的函数:

  1.     def compute_loss(self, y_true, y_pred):
  2.        # 在keras中自定义loss函数,它的两个输入必须为预测的输出和标准的输出
  3. # 变量:
  4. # y_pred: 它的shape为: (?, num_boxes, 4 + num_classes + 8). 就是在model框架部分介绍的输出。
  5.         # y_truth:它的shape和y_pred的shape是一样的,就是上一节我们介绍assignment那一块的输出,具体参考上一节。
  6.         # 返回最终的所有loss总和
  7.         batch_size = tf.shape(y_true)[0]
  8.         num_boxes = tf.to_float(tf.shape(y_true)[1])
  9.         # 计算出所有default box的loss
  10.         conf_loss = self._softmax_loss(y_true[:, :, 4:-8],
  11.                                        y_pred[:, :, 4:-8])
  12.         loc_loss = self._l1_smooth_loss(y_true[:, :, :4],
  13.                                         y_pred[:, :, :4])
  14.         #计算positive 样本的loss
  15.         #num_pos 为一个一维的array:len(num_pos)=batch
  16.         num_pos = tf.reduce_sum(y_true[:, :, -8], axis=-1)
  17.         ##只需计算存在gt_box与其对应的loss
  18.         pos_loc_loss = tf.reduce_sum(loc_loss * y_true[:, :, -8],
  19.                                      axis=1)
  20.         pos_conf_loss = tf.reduce_sum(conf_loss * y_true[:, :, -8],
  21.                                       axis=1)
  22.         #计算negative sample的loss,只计算了confidence loss
  23.         num_neg = tf.minimum(self.neg_pos_ratio * num_pos,
  24.                              num_boxes - num_pos)
  25.         pos_num_neg_mask = tf.greater(num_neg, 0)
  26.         has_min = tf.to_float(tf.reduce_any(pos_num_neg_mask))
  27.         num_neg = tf.concat(axis=0, values=[num_neg,
  28.                                 [(1 - has_min) * self.negatives_for_hard]])
  29.         #tf.boolen_mask(a,b),例如b=[true, false],a=[[[2,2],[2,3]]],则输出为[2,2]。
  30.         #实际上就是取num_neg为正数的那些元素,然后再在其中取num_neg中的最小的元素作为num_neg_batch。
  31.         num_neg_batch = tf.reduce_min(tf.boolean_mask(num_neg,
  32.                                                       tf.greater(num_neg, 0)))
  33.         num_neg_batch = tf.to_int32(num_neg_batch)
  34.         confs_start = 4 + self.background_label_id + 1
  35.         confs_end = confs_start + self.num_classes - 1
  36.         #max_confs的shape为:(batch, num_prior)
  37.         max_confs = tf.reduce_max(y_pred[:, :, confs_start:confs_end],
  38.                                   axis=2)
  39.         #返回负样本的top-K个元素,最终返回的indices的shape为(batch, K=num_neg_batch)
  40.         _, indices = tf.nn.top_k(max_confs * (1 - y_true[:, :, -8]),
  41.                                  k=num_neg_batch)
  42.         #创建一个shape也为(batch,num_neg_batch)的indices
  43.         batch_idx = tf.expand_dims(tf.range(0, batch_size), 1)
  44.         batch_idx = tf.tile(batch_idx, (1, num_neg_batch))
  45.         #乘以num_boxes后得到batch中每一个sample的index的起始值,再加上top_k得到的index就得到了一个一维的full_indices。
  46.         full_indices = (tf.reshape(batch_idx, [-1]) * tf.to_int32(num_boxes) +
  47.                         tf.reshape(indices, [-1]))
  48.         #把得到的conf_loss也reshape成一维,然后用full_indices对其进行取值
  49.         neg_conf_loss = tf.gather(tf.reshape(conf_loss, [-1]),
  50.                                   full_indices)
  51.         #最终把负样本的confidence loss reshape 成(batch, num_neg_batch),再对每个sample上的loss求和。
  52.         neg_conf_loss = tf.reshape(neg_conf_loss,
  53.                                    [batch_size, num_neg_batch])
  54.         neg_conf_loss = tf.reduce_sum(neg_conf_loss, axis=1)
  55.         #整合所有的loss:positive loss 和 negative loss
  56.         total_loss = pos_conf_loss + neg_conf_loss
  57.         total_loss /= (num_pos + tf.to_float(num_neg_batch))
  58.         num_pos = tf.where(tf.not_equal(num_pos, 0), num_pos,
  59.                             tf.ones_like(num_pos))
  60.         total_loss += (self.alpha * pos_loc_loss) / num_pos
  61.         return total_loss

    这时候loss function 也准备好了,一切都准备就绪,接下来当然就是进行训练了。其实在写这篇blog之前我对loss function 这块还没有太看明白,写完之后顿时就恍然大悟了,写blog确实是一个很好的自我学习过程。

model 进行 training

training这一块是写在SSD_training.ipynb的jupyter notebook文件中的,上面那些model 的部件准备好了之后,training就按照keras的流程照搬就好了。

不过需要注意一下,作者给的这个训练并不是voc数据集的训练,而是对3种瓶子的检测。

1.必要的库和自己编写的模块的导入:

  1. import cv2
  2. import keras
  3. from keras.applications.imagenet_utils import preprocess_input
  4. from keras.backend.tensorflow_backend import set_session
  5. from keras.models import Model
  6. from keras.preprocessing import image
  7. import matplotlib.pyplot as plt
  8. import numpy as np
  9. import pickle
  10. from random import shuffle
  11. from scipy.misc import imread
  12. from scipy.misc import imresize
  13. import tensorflow as tf
  14. from ssd import SSD300
  15. from ssd_training import MultiboxLoss
  16. from ssd_utils import BBoxUtility
  17. %matplotlib inline
  18. plt.rcParams[\'figure.figsize\'] = (8, 8)
  19. plt.rcParams[\'image.interpolation\'] = \'nearest\'
  20. np.set_printoptions(suppress=True)

2.必要的初始化参数和prior box 的读取,以及输入数据的读取:

  1. NUM_CLASSES = 4
  2. input_shape = (300, 300, 3)
  3. #prior_boxes_ssd300.pkl 存放了所有的prior:[xmin, ymin, xmax, ymax,var[0],var[1],var[2],var[3]]
  4. priors = pickle.load(open(\'prior_boxes_ssd300.pkl\', \'rb\'))
  5. bbox_util = BBoxUtility(NUM_CLASSES, priors)
  6. #获得输入数据的file_name、bounding box 和 label
  7. gt = pickle.load(open(\'gt_pascal.pkl\', \'rb\'))
  8. keys = sorted(gt.keys())
  9. num_train = int(round(0.8 * len(keys)))
  10. train_keys = keys[:num_train]
  11. val_keys = keys[num_train:]
  12. num_val = len(val_keys)

3.输入数据和label的generator类定义,有点长,就把generate 那个函数贴出来:

  1. class Generator(object):
  2.     def generate(self, train=True):
  3.         while True:
  4.             if train:
  5.                 shuffle(self.train_keys)
  6.                 keys = self.train_keys
  7.             else:
  8.                 shuffle(self.val_keys)
  9.                 keys = self.val_keys
  10.             inputs = []
  11.             targets = []
  12.             for key in keys:            
  13.                 img_path = self.path_prefix + key
  14.                 img = imread(img_path).astype(\'float32\')
  15.                 y = self.gt[key].copy()
  16.                 ##y的shape是一张图片中box的数目和位置+类别。(num_box, coordinate+num_class)
  17.                 if train and self.do_crop:
  18.                     img, y = self.random_sized_crop(img, y)
  19.                 img = imresize(img, self.image_size).astype(\'float32\')
  20.                 if train:
  21.                     shuffle(self.color_jitter)
  22.                     for jitter in self.color_jitter:
  23.                         img = jitter(img)
  24.                     if self.lighting_std:
  25.                         img = self.lighting(img)
  26.                     if self.hflip_prob > 0:
  27.                         img, y = self.horizontal_flip(img, y)
  28.                     if self.vflip_prob > 0:
  29.                         img, y = self.vertical_flip(img, y)
  30.                 y = self.bbox_util.assign_boxes(y)
  31.                 inputs.append(img)                
  32.                 targets.append(y)
  33.                 if len(targets) == self.batch_size:
  34.                     tmp_inp = np.array(inputs)
  35.                     tmp_targets = np.array(targets)
  36.                     inputs = []
  37.                     targets = []
  38.                     yield preprocess_input(tmp_inp), tmp_targets #batch 生成器

4.必要的初始化

  1. #输入数据(图片)的root directory
  2. path_prefix = \'../../frames/\'
  3. gen = Generator(gt, bbox_util, 16, \'../../frames/\',
  4. train_keys, val_keys,
  5. (input_shape[0], input_shape[1]), do_crop=False)
  6. #构建SSD300的model
  7. model = SSD300(input_shape, num_classes=NUM_CLASSES)
  8. model.load_weights(\'weights_SSD300.hdf5\', by_name=True)
  9. #也没太弄懂,为什么需要把他们给freeze,为啥也对他们train
  10. freeze = [\'input_1\', \'conv1_1\', \'conv1_2\', \'pool1\',
  11. \'conv2_1\', \'conv2_2\', \'pool2\',
  12. \'conv3_1\', \'conv3_2\', \'conv3_3\', \'pool3\']
  13. for L in model.layers:
  14. if L.name in freeze:
  15. L.trainable = False

5.keras的一些callback function的定义以及model的compile and training:

  1. def schedule(epoch, decay=0.9):
  2. return base_lr * decay**(epoch)
  3. callbacks = [keras.callbacks.ModelCheckpoint(\'./checkpoints/weights.{epoch:02d}-{val_loss:.2f}.hdf5\',
  4. verbose=1,
  5. save_weights_only=True),
  6. keras.callbacks.LearningRateScheduler(schedule)]
  7. base_lr = 3e-4
  8. optim = keras.optimizers.Adam(lr=base_lr)
  9. # optim = keras.optimizers.RMSprop(lr=base_lr)
  10. # optim = keras.optimizers.SGD(lr=base_lr, momentum=0.9, decay=decay, nesterov=True)
  11. model.compile(optimizer=optim,
  12. loss=MultiboxLoss(NUM_CLASSES, neg_pos_ratio=2.0).compute_loss)
  13. nb_epoch = 30
  14. history = model.fit_generator(gen.generate(True), gen.train_batches,
  15. nb_epoch, verbose=1,
  16. callbacks=callbacks,
  17. validation_data=gen.generate(False),
  18. nb_val_samples=gen.val_batches,
  19. nb_worker=1)

6.train完了之后,当然是检测了:

  1. #数据的读取
  2. inputs = []
  3. images = []
  4. img_path = path_prefix + sorted(val_keys)[0]
  5. img = image.load_img(img_path, target_size=(300, 300))
  6. img = image.img_to_array(img)
  7. images.append(imread(img_path))
  8. inputs.append(img.copy())
  9. inputs = preprocess_input(np.array(inputs))
  10. #进行预测和预测后对预测结果的解码
  11. preds = model.predict(inputs, batch_size=1, verbose=1)
  12. results = bbox_util.detection_out(preds)
  13. #可视化预测结果
  14. for i, img in enumerate(images):
  15. # Parse the outputs.
  16. det_label = results[i][:, 0]
  17. det_conf = results[i][:, 1]
  18. det_xmin = results[i][:, 2]
  19. det_ymin = results[i][:, 3]
  20. det_xmax = results[i][:, 4]
  21. det_ymax = results[i][:, 5]
  22. # Get detections with confidence higher than 0.6.
  23. top_indices = [i for i, conf in enumerate(det_conf) if conf >= 0.6]
  24. top_conf = det_conf[top_indices]
  25. top_label_indices = det_label[top_indices].tolist()
  26. top_xmin = det_xmin[top_indices]
  27. top_ymin = det_ymin[top_indices]
  28. top_xmax = det_xmax[top_indices]
  29. top_ymax = det_ymax[top_indices]
  30. colors = plt.cm.hsv(np.linspace(0, 1, 4)).tolist()
  31. plt.imshow(img / 255.)
  32. currentAxis = plt.gca()
  33. for i in range(top_conf.shape[0]):
  34. xmin = int(round(top_xmin[i] * img.shape[1]))
  35. ymin = int(round(top_ymin[i] * img.shape[0]))
  36. xmax = int(round(top_xmax[i] * img.shape[1]))
  37. ymax = int(round(top_ymax[i] * img.shape[0]))
  38. score = top_conf[i]
  39. label = int(top_label_indices[i])
  40.      #注意这里的label直接使用的数字,因为它train的数据集不是voc,而是几种瓶子的种类。
  41. display_txt = \'{:0.2f}, {}\'.format(score, label)
  42. coords = (xmin, ymin), xmax-xmin+1, ymax-ymin+1
  43. color = colors[label]
  44. currentAxis.add_patch(plt.Rectangle(*coords, fill=False, edgecolor=color, linewidth=2))
  45. currentAxis.text(xmin, ymin, display_txt, bbox={\'facecolor\':color, \'alpha\':0.5})
  46. plt.show()

7.predict 的结果:


整个过程也就基本上结束了。SSD的keras实现还是比较简单的,没有mask r-cnn那么费劲。不知道为啥我先看的yolo的原理和实现,但是不太想写yolo的实现和原理(手动白眼),直接跳到了SSD,大概是觉得SSD比较好理解吧,yolo等有时间再写吧。

之后我再把生成prior box pkl文件的代码贴上来,自己写的代码有点乱。希望看到了最后你对SSD的模型架构和具体实现都有了一个很好的认识。因为也是一个新手,所以其中有什么理解不到位,或者写错的,欢迎指出。

添加:prior box 的 pkl文件生成代码:其实也很简单,就是稍微修改了一下PriorBox这个自定义的keras layer,把输出用来产生对于特定feature map 大小的 default box:

  1. import numpy as np
  2. class PriorBox():
  3. def __init__(self, img_size, min_size, max_size=None, aspect_ratios=None,
  4. flip=True, variances=[0.1,0.1,0.2,0.2], clip=True, layer_shape=[8,8],**kwargs):
  5. self.input_shape = layer_shape
  6. self.img_size = img_size
  7. if min_size <= 0:
  8. raise Exception(\'min_size must be positive.\')
  9. self.min_size = min_size
  10. self.max_size = max_size
  11. self.aspect_ratios = [1.0]
  12. if max_size:
  13. if max_size < min_size:
  14. raise Exception(\'max_size must be greater than min_size.\')
  15. self.aspect_ratios.append(1.0)
  16. if aspect_ratios:
  17. for ar in aspect_ratios:
  18. if ar in self.aspect_ratios:
  19. continue
  20. self.aspect_ratios.append(ar)
  21. if flip:
  22. self.aspect_ratios.append(1.0 / ar)
  23. self.variances = np.array(variances)
  24. self.clip = True
  25. super(PriorBox, self).__init__(**kwargs)
  26. def compute_default_box(self):
  27. layer_height = self.input_shape[0]
  28. layer_width = self.input_shape[1]
  29. img_width = self.img_size[0]
  30. img_height = self.img_size[1]
  31. # define prior boxes shapes
  32. box_widths = []
  33. box_heights = []
  34. for ar in self.aspect_ratios:
  35. if ar == 1 and len(box_widths) == 0:
  36. box_widths.append(self.min_size)
  37. box_heights.append(self.min_size)
  38. elif ar == 1 and len(box_widths) > 0:
  39. box_widths.append(np.sqrt(self.min_size * self.max_size))
  40. box_heights.append(np.sqrt(self.min_size * self.max_size))
  41. elif ar != 1:
  42. box_widths.append(self.min_size * np.sqrt(ar))
  43. box_heights.append(self.min_size / np.sqrt(ar))
  44. box_widths = 0.5 * np.array(box_widths)
  45. box_heights = 0.5 * np.array(box_heights)
  46. # define centers of prior boxes
  47. step_x = img_width / layer_width
  48. step_y = img_height / layer_height
  49. #generate a list data
  50. linx = np.linspace(0.5 * step_x, img_width - 0.5 * step_x,
  51. layer_width)
  52. liny = np.linspace(0.5 * step_y, img_height - 0.5 * step_y,
  53. layer_height)
  54. ##ulitize meshgrid function to generate default box\'s coordinates
  55. centers_x, centers_y = np.meshgrid(linx, liny)
  56. centers_x = centers_x.reshape(-1, 1)
  57. centers_y = centers_y.reshape(-1, 1)
  58. # define xmin, ymin, xmax, ymax of prior boxes
  59. num_priors_ = len(self.aspect_ratios)
  60. prior_boxes = np.concatenate((centers_x, centers_y), axis=1)
  61. prior_boxes = np.tile(prior_boxes, (1, 2 * num_priors_))
  62. prior_boxes[:, ::4] -= box_widths
  63. prior_boxes[:, 1::4] -= box_heights
  64. prior_boxes[:, 2::4] += box_widths
  65. prior_boxes[:, 3::4] += box_heights
  66. prior_boxes[:, ::2] /= img_width
  67. prior_boxes[:, 1::2] /= img_height
  68. prior_boxes = prior_boxes.reshape(-1, 4)
  69. if self.clip:
  70. prior_boxes = np.minimum(np.maximum(prior_boxes, 0.0), 1.0)
  71. # define variances
  72. num_boxes = len(prior_boxes)
  73. if len(self.variances) == 1:
  74. variances = np.ones((num_boxes, 4)) * self.variances[0]
  75. elif len(self.variances) == 4:
  76. variances = np.tile(self.variances, (num_boxes, 1))
  77. else:
  78. raise Exception(\'Must provide one or four variances.\')
  79. prior_boxes = np.concatenate((prior_boxes, variances), axis=1)
  80. return prior_boxes
  81. #调用修改后的PriorBox类
  82. img_size = (300, 300)
  83. default_box_layer1 = PriorBox(img_size, 30, [], aspect_ratios=[2], layer_shape=(38,38)).compute_default_box()
  84. default_box_layer2 = PriorBox(img_size, 60, 114, aspect_ratios=[2,3], layer_shape=(19,19)).compute_default_box()
  85. default_box_layer3 = PriorBox(img_size, 114, 168, aspect_ratios=[2,3], layer_shape=(10,10)).compute_default_box()
  86. default_box_layer4 = PriorBox(img_size, 168, 222, aspect_ratios=[2,3], layer_shape=(5,5)).compute_default_box()
  87. default_box_layer5 = PriorBox(img_size, 222, 276, aspect_ratios=[2,3], layer_shape=(3,3)).compute_default_box()
  88. default_box_layer6 = PriorBox(img_size, 276, 330, aspect_ratios=[2,3], layer_shape=(1,1)).compute_default_box()
  89. #把各层的输出concatenate起来
  90. default_box = np.concatenate((default_box_layer1, default_box_layer2, default_box_layer3,\
  91. default_box_layer4, default_box_layer5, default_box_layer6), axis=0)
  92. #写入到pkl文件中
  93. import pickle
  94. pickle.dump(default_box,open("default_box_information","wb"))

相关文章