Table of Contents
- RPN: Complete Code
- RPN: Step-by-Step Implementation
- Data Annotation
- Reading the Annotation Data
- Fixing the Image Size and Adjusting the Target Boxes
- Getting feature_shape from a Pretrained Model
- Defining the RPN Network
- Generating the RPN CLS and REG Datasets
- Getting All the Anchors
- Computing the IOU Between Anchors and Target Boxes
- Defining the RPN Loss and the Training Process
- References
This post implements part of a two-stage object detector, which consists of an RPN (region proposal) stage and an ROI (classification) stage. The latter is essentially an image-classification task; the former is the more involved part, so only the RPN stage is implemented here.
RPN: Complete Code
import xml.etree.ElementTree as ET
import os
import pandas as pd
import tensorflow as tf
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
def generate_anchors(sizes=[128, 256, 512], ratios=[[1, 1], [1, 2], [2, 1]]):
    """Generate the base anchors as [xmin, ymin, xmax, ymax] centered on (0, 0)."""
    num_sizes = len(sizes)
    num_anchors = num_sizes * len(ratios)
    anchors = np.zeros((num_anchors, 4))
    # Widths/heights: the sizes repeat once per ratio, in groups of len(sizes).
    anchors[:, 2:] = np.tile(sizes, (2, len(ratios))).T
    for i in range(len(ratios)):
        anchors[num_sizes * i: num_sizes * (i + 1), 2] *= ratios[i][0]
        anchors[num_sizes * i: num_sizes * (i + 1), 3] *= ratios[i][1]
    # Re-center from [0, 0, w, h] to [-w/2, -h/2, w/2, h/2].
    anchors[:, 0::2] -= np.tile(anchors[:, 2] * 0.5, (2, 1)).T
    anchors[:, 1::2] -= np.tile(anchors[:, 3] * 0.5, (2, 1)).T
    return anchors
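As a quick sanity check (my addition, not part of the original post), the nine base anchors can be inspected directly; each row is [xmin, ymin, xmax, ymax] centered on the origin:
base_anchors = generate_anchors()
print(base_anchors.shape)  # (9, 4)
print(base_anchors[0])     # [-64. -64.  64.  64.]  -- the 128x128, ratio 1:1 anchor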
def shift(shape, anchors, stride=16):
    # Centers of each feature-map cell, mapped back to input-image coordinates.
    shift_x = (np.arange(0, shape[1], dtype=np.float32) + 0.5) * stride
    shift_y = (np.arange(0, shape[0], dtype=np.float32) + 0.5) * stride
    shift_x, shift_y = np.meshgrid(shift_x, shift_y)
    shift_x = np.reshape(shift_x, [-1])
    shift_y = np.reshape(shift_y, [-1])
    shifts = np.stack([shift_x, shift_y, shift_x, shift_y], axis=0)
    shifts = np.transpose(shifts)
    number_of_anchors = np.shape(anchors)[0]
    k = np.shape(shifts)[0]
    # Broadcast-add every base anchor to every cell center.
    shifted_anchors = np.reshape(anchors, [1, number_of_anchors, 4]) + np.array(np.reshape(shifts, [k, 1, 4]), dtype=np.float32)
    shifted_anchors = np.reshape(shifted_anchors, [k * number_of_anchors, 4])
    return shifted_anchors
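shift places one copy of the base anchors at every feature-map cell; cell (i, j) maps back to image coordinates ((j + 0.5) * stride, (i + 0.5) * stride). A minimal count check (my addition):
shifted = shift([37, 37], generate_anchors())
print(shifted.shape)  # (12321, 4) == (37 * 37 * 9, 4)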
def get_anchors(input_shape, feature_shape, sizes=[128, 256, 512], ratios=[[1, 1], [1, 2], [2, 1]], stride=16):
    anchors = generate_anchors(sizes=sizes, ratios=ratios)
    anchors = shift(feature_shape, anchors, stride=stride)
    anchors[:, ::2] = np.clip(anchors[:, ::2], 0, input_shape[1])
    anchors[:, 1::2] = np.clip(anchors[:, 1::2], 0, input_shape[0])
    return anchors
%%time
anchors = get_anchors([600,600], [37,37])
anchors
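For a 600x600 input and a 37x37 feature map, this produces one row per (cell, base anchor) pair: 37 * 37 * 9 = 12321 anchors, i.e. an array of shape (12321, 4), with every coordinate clipped into the image.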
## Data Preparation
def get_xml_box(file_path, return_object_name=False):
    """Returns a record of the form [filename, object_name_list, box_list],
    where box_list holds [xmin, ymin, xmax, ymax] entries."""
    tree = ET.parse(file_path)
    root = tree.getroot()
    filename = root.find('filename').text
    object_name_list = []
    box_list = []
    for item in root.iter('object'):
        object_name = item.find('name').text
        box = item.find('bndbox')
        xmin = box.find('xmin').text
        ymin = box.find('ymin').text
        xmax = box.find('xmax').text
        ymax = box.find('ymax').text
        object_name_list.append(object_name)
        box_list.append([xmin, ymin, xmax, ymax])
    return [filename, object_name_list, box_list]
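For reference, a Pascal VOC annotation file is XML of roughly the following shape (abridged here to the fields get_xml_box actually reads; values are illustrative):
<annotation>
    <filename>000001.jpg</filename>
    <object>
        <name>dog</name>
        <bndbox>
            <xmin>48</xmin><ymin>240</ymin><xmax>195</xmax><ymax>371</ymax>
        </bndbox>
    </object>
    <!-- one <object> element per annotated box -->
</annotation>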
xml_files = ['../data/VOC2007/Annotations/' + xml_file for xml_file in os.listdir('../data/VOC2007/Annotations/') if xml_file.endswith('xml')]
data = [get_xml_box(xml_file) for xml_file in xml_files]
df = pd.DataFrame(data)
df.columns = ['filename', 'object_name_list', 'box_list']
df['filename'] = '../data/VOC2007/JPEGImages/' + df['filename']
df.head()
# sorted() makes the class-index mapping reproducible across runs
class_name = sorted(set(item for items in df.object_name_list.values.tolist() for item in items))
class_nums = len(class_name) + 1  # +1 reserves index 0 for the background class
class_name2index = dict(zip(class_name, range(1, class_nums)))
class_index2name = dict(zip(range(1, class_nums), class_name))
df['object_name_list'] = df['object_name_list'].map(lambda x: [class_name2index[item] for item in x])
df.head()
## Fixing the Image Size
def get_final_image_and_box(filename, box, input_shape=[600, 600]):
    image = Image.open(filename)
    box = np.array(box).astype(np.float32)
    iw, ih = image.size
    h, w = input_shape
    # Scale so the image fits inside input_shape while keeping its aspect ratio.
    scale = min(w/iw, h/ih)
    nw = int(iw*scale)
    nh = int(ih*scale)
    dx = (w-nw)//2
    dy = (h-nh)//2
    # Build the final image: resize, then paste centered onto a gray canvas.
    image = image.resize((nw, nh), Image.BICUBIC)
    new_image = Image.new('RGB', (w, h), (128, 128, 128))
    new_image.paste(image, (dx, dy))
    image_data = np.array(new_image, np.float32)
    # Build the final boxes: apply the same scale and offset, then clip.
    box[:, [0, 2]] = box[:, [0, 2]]*nw/iw + dx
    box[:, [1, 3]] = box[:, [1, 3]]*nh/ih + dy
    box[:, 0:2][box[:, 0:2] < 0] = 0
    box[:, 2][box[:, 2] > w] = w
    box[:, 3][box[:, 3] > h] = h
    # Discard boxes that collapse to (almost) nothing after resizing.
    box_w = box[:, 2] - box[:, 0]
    box_h = box[:, 3] - box[:, 1]
    box = box[np.logical_and(box_w > 1, box_h > 1)]
    return image_data, box
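A worked example of the letterbox arithmetic (numbers mine, for illustration): an 800x600 image resized into a 600x600 canvas gets scale = min(600/800, 600/600) = 0.75, so nw = 600, nh = 450, dx = 0 and dy = 75. The image is pasted 75 px down on the gray canvas, and every box is scaled by 0.75 and shifted down by 75 px before clipping.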
filename = '../data/VOC2007/JPEGImages/000001.jpg'
target_box = [[9, 16, 374, 430], [378, 86, 625, 447]]
input_shape = [600, 600]
image_data, target_box = get_final_image_and_box(filename, target_box, input_shape)
image_data.shape, target_box
def compute_iou(boxes0: np.ndarray, boxes1: np.ndarray):
    """
    Compute the pairwise IOU between two sets of bounding boxes.
    boxes0: `~np.ndarray` of shape `(A, 4)`
    boxes1: `~np.ndarray` of shape `(B, 4)`
    Returns iou: `~np.ndarray` of shape `(A, B)`
    """
    boxes0 = np.array(boxes0)
    boxes1 = np.array(boxes1)
    A = boxes0.shape[0]
    B = boxes1.shape[0]
    xy_max = np.minimum(boxes0[:, np.newaxis, 2:].repeat(B, axis=1),
                        np.broadcast_to(boxes1[:, 2:], (A, B, 2)))
    xy_min = np.maximum(boxes0[:, np.newaxis, :2].repeat(B, axis=1),
                        np.broadcast_to(boxes1[:, :2], (A, B, 2)))
    # Intersection area
    inter = np.clip(xy_max - xy_min, a_min=0, a_max=np.inf)
    inter = inter[:, :, 0] * inter[:, :, 1]
    # Area of each rectangle
    area_0 = ((boxes0[:, 2] - boxes0[:, 0]) * (
        boxes0[:, 3] - boxes0[:, 1]))[:, np.newaxis].repeat(B, axis=1)
    area_1 = ((boxes1[:, 2] - boxes1[:, 0]) * (
        boxes1[:, 3] - boxes1[:, 1]))[np.newaxis, :].repeat(A, axis=0)
    return inter / (area_0 + area_1 - inter)
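A quick sanity check (my addition): identical boxes should give IOU 1, disjoint boxes 0, and half-overlapping boxes something in between:
a = np.array([[0, 0, 100, 100]])
b = np.array([[0, 0, 100, 100], [100, 100, 200, 200], [50, 0, 150, 100]])
print(compute_iou(a, b))  # [[1.         0.         0.33333333]]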
def get_cls_and_reg_data(anchors, target_box, threshold_min=0.3, threshold_max=0.7, sample_size=256):
    iou = compute_iou(anchors, target_box)  # compute once and reuse
    positive_iou = iou > threshold_max
    negative_iou = iou < threshold_min
    positive_cls = np.any(positive_iou, axis=1).astype(np.float32)
    negative_cls = np.all(negative_iou, axis=1).astype(np.float32)
    # np.random.choice samples with replacement by default, so sample_size
    # indices come back even when there are fewer distinct candidates.
    positive_index = np.random.choice(np.where(positive_cls == 1)[0], size=sample_size)
    negative_index = np.random.choice(np.where(negative_cls == 1)[0], size=sample_size)
    rpn_cls = np.concatenate([positive_index, negative_index], axis=0)
    # For each target box, collect the indices of its positive anchors.
    rpn_reg = [np.where(positive_iou[:, ix])[0].tolist() for ix in range(len(target_box))]
    return rpn_cls, rpn_reg
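The 0.3/0.7 defaults follow the Faster R-CNN paper: an anchor whose IOU with some ground-truth box exceeds 0.7 is a positive sample, an anchor below 0.3 against every box is a negative sample, and anchors in between contribute nothing to the classification loss. One caveat of the sampling above: np.random.choice raises an error when there are no positive anchors at all, which can happen for small or oddly shaped objects, so in practice a fallback (e.g. also labeling each box's highest-IOU anchor positive, as the paper does) may be needed.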
class RPN(tf.keras.Model):
    def __init__(self, num_anchors):
        super(RPN, self).__init__()
        # Frozen VGG16 backbone, cut at the last conv layer (block5_conv3).
        self.get_feature_model = tf.keras.applications.vgg16.VGG16(include_top=False, input_shape=[600, 600, 3])
        self.get_feature_model = tf.keras.models.Model(inputs=self.get_feature_model.input, outputs=self.get_feature_model.layers[-2].output)
        self.get_feature_model.trainable = False
        self.conv_base = tf.keras.layers.Conv2D(512, (3, 3), padding='same', activation='relu', name='rpn_conv1')
        self.conv_class = tf.keras.layers.Conv2D(num_anchors, (1, 1), activation='sigmoid', name='rpn_out_class')
        self.conv_regr = tf.keras.layers.Conv2D(num_anchors * 4, (1, 1), activation='linear', name='rpn_out_regress')
        self.flatten = tf.keras.layers.Flatten()

    def call(self, x):
        x = self.get_feature_model(x)
        x = self.conv_base(x)
        # Objectness score per anchor, flattened to (batch, H * W * num_anchors).
        x_cls = self.flatten(self.conv_class(x))
        # Box regression, reshaped to (batch, 4, H * W * num_anchors).
        x_reg = tf.reshape(self.conv_regr(x), [tf.shape(x)[0], -1, 4])
        x_reg = tf.transpose(x_reg, perm=[0, 2, 1])
        return x_cls, x_reg
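With a 600x600 input, the truncated VGG16 (layers[-2] is block5_conv3, the last conv before the final pooling) outputs a 37x37x512 feature map, a stride-16 grid matching feature_shape = [37, 37] above. With num_anchors = 9, the outputs are x_cls of shape (batch, 37 * 37 * 9) = (batch, 12321) and x_reg of shape (batch, 4, 12321), lining up with the flattened anchor list from get_anchors.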
rpn = RPN(9)
x = np.stack([image_data,image_data])
y = [[[9, 16, 374, 430], [378, 86, 625, 447]], [[9, 16, 374, 430]]]
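The post continues from here with the RPN loss and training loop; as a minimal forward-pass check of the setup so far (my addition, assuming the variables above):
x_cls, x_reg = rpn(x)
print(x_cls.shape, x_reg.shape)  # (2, 12321) (2, 4, 12321)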