- RPN 整体代码
- RPN 具体实现过程
- 数据标注
- 读取标注数据
- 固定图片大小调整目标框
- 使用预训练模型获取 feature_shape
- 定义 RPN 网络
- 生成 RPN 的 CLS 和 REG 数据集
- 获取所有的锚点
- 计算锚点与目标框的IOU
- 定义 RPN loss 和 训练过程
- 参考资料
import xml.etree.ElementTree as ET
import os
import pandas as pd
import tensorflow as tf
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
def generate_anchors(sizes = [128, 256, 512], ratios = [[1, 1], [1, 2], [2, 1]]):
    """Build the base anchor boxes, centred on the origin.

    Args:
        sizes: Base side lengths. The defaults are only read, never mutated.
        ratios: ``[width_factor, height_factor]`` pairs; every size is
            combined with every ratio.

    Returns:
        Array of shape ``(len(sizes) * len(ratios), 4)`` holding
        ``[x_min, y_min, x_max, y_max]``. Rows are grouped by ratio, and
        within each group they follow the order of ``sizes``.
    """
    num_sizes = len(sizes)
    num_anchors = num_sizes * len(ratios)
    anchors = np.zeros((num_anchors, 4))
    # Columns 2 and 3 temporarily hold width and height: the size list is
    # repeated once per ratio.
    anchors[:, 2:] = np.tile(sizes, (2, len(ratios))).T
    for i, ratio in enumerate(ratios):
        # Each ratio owns one contiguous group of ``num_sizes`` rows. The
        # group width must follow ``sizes``; a literal 3 only works for the
        # default three sizes.
        rows = slice(num_sizes * i, num_sizes * (i + 1))
        anchors[rows, 2] *= ratio[0]
        anchors[rows, 3] *= ratio[1]
    # Turn (0, 0, w, h) into corner form centred on the origin:
    # (-w/2, -h/2, w/2, h/2).
    anchors[:, 0::2] -= np.tile(anchors[:, 2] * 0.5, (2, 1)).T
    anchors[:, 1::2] -= np.tile(anchors[:, 3] * 0.5, (2, 1)).T
    return anchors
def shift(shape, anchors, stride=16):
    """Replicate the base anchors at the centre of every feature-map cell.

    Args:
        shape: ``(rows, cols)`` of the feature map.
        anchors: Base anchors of shape ``(num_anchors, 4)``.
        stride: Distance in input pixels between neighbouring cells.

    Returns:
        Array of shape ``(rows * cols * num_anchors, 4)``. Cells are visited
        row by row, and all anchors of one cell are emitted together.
    """
    # Cell centres in input-image coordinates: (index + 0.5) * stride.
    centers_x = (np.arange(0, shape[1], dtype=np.float32) + 0.5) * stride
    centers_y = (np.arange(0, shape[0], dtype=np.float32) + 0.5) * stride
    grid_x, grid_y = np.meshgrid(centers_x, centers_y)
    flat_x = grid_x.reshape(-1)
    flat_y = grid_y.reshape(-1)

    # One (dx, dy, dx, dy) offset per cell, so both corners move together.
    offsets = np.stack([flat_x, flat_y, flat_x, flat_y], axis=1)

    anchor_count = np.shape(anchors)[0]
    cell_count = offsets.shape[0]
    per_cell_offsets = np.array(offsets.reshape(cell_count, 1, 4), dtype=np.float32)
    base = np.reshape(anchors, [1, anchor_count, 4])
    # Broadcasting (1, A, 4) + (K, 1, 4) gives every anchor at every cell.
    placed = base + per_cell_offsets
    return placed.reshape(cell_count * anchor_count, 4)
def get_anchors(input_shape, feature_shape, sizes = [128, 256, 512], ratios = [[1, 1], [1, 2], [2, 1]], stride=16):
    """Return every anchor of the feature map, clipped to the input image.

    Args:
        input_shape: ``(height, width)`` of the network input.
        feature_shape: ``(rows, cols)`` of the feature map.
        sizes: Base anchor sizes, forwarded to ``generate_anchors``.
        ratios: Anchor ratios, forwarded to ``generate_anchors``.
        stride: Cell spacing in input pixels, forwarded to ``shift``.

    Returns:
        Array of shape ``(num_cells * num_base_anchors, 4)`` in
        ``[x_min, y_min, x_max, y_max]`` form.
    """
    base_anchors = generate_anchors(sizes=sizes, ratios=ratios)
    all_anchors = shift(feature_shape, base_anchors, stride=stride)

    max_y, max_x = input_shape[0], input_shape[1]
    # Even columns are x coordinates, odd columns are y coordinates.
    x_coords = all_anchors[:, 0::2]
    y_coords = all_anchors[:, 1::2]
    all_anchors[:, 0::2] = np.clip(x_coords, 0, max_x)
    all_anchors[:, 1::2] = np.clip(y_coords, 0, max_y)
    return all_anchors
# Precompute all anchors for a 600x600 input and a 37x37 feature map, using
# the default sizes, ratios and stride of get_anchors.
anchors = get_anchors([600,600], [37,37])
## Data preparation
def get_xml_box(file_path, return_object_name=False):
    """Read one annotation XML file and collect its objects.

    Args:
        file_path: Path (or file object) accepted by ``ET.parse``.
        return_object_name: Currently unused; kept so existing callers
            keep working.

    Returns:
        ``[filename, object_name_list, box_list]`` where ``filename`` is the
        text of the ``<filename>`` element, ``object_name_list`` holds one
        class name per ``<object>``, and ``box_list`` holds one
        ``[xmin, ymin, xmax, ymax]`` per ``<object>`` in the same order.
        Coordinates are returned as the raw element text (strings).
    """
    tree = ET.parse(file_path)
    root = tree.getroot()
    filename = root.find('filename').text
    object_name_list = []
    box_list = []
    for item in root.iter('object'):
        # The name must be recorded alongside its box; without this append
        # the returned name list is always empty.
        object_name_list.append(item.find('name').text)
        box = item.find('bndbox')
        box_list.append([box.find(tag).text for tag in ('xmin', 'ymin', 'xmax', 'ymax')])
    return [filename, object_name_list, box_list]
# Parse every annotation file into one row: image name, class names, boxes.
xml_files = ['../data/VOC2007/Annotations/' + xml_file for xml_file in os.listdir('../data/VOC2007/Annotations/') if xml_file.endswith('xml')]
data = [get_xml_box(xml_file) for xml_file in xml_files]
df = pd.DataFrame(data)
df.columns = ['filename', 'object_name_list', 'box_list']
# Turn the bare image name into a path relative to the working directory.
df['filename'] = '../data/VOC2007/JPEGImages/' + df['filename']

# All distinct class names found in the annotations.
class_name = {item for items in df.object_name_list.values.tolist() for item in items}
# Index 0 is left unassigned, so class indices start at 1.
class_nums = len(class_name) + 1
# Iteration order of a set of strings changes between interpreter runs, so
# the names are sorted to give every class the same index on every run.
_sorted_class_names = sorted(class_name)
class_name2index = dict(zip(_sorted_class_names, range(1, class_nums)))
class_index2name = dict(zip(range(1, class_nums), _sorted_class_names))
# Replace class names by their integer indices.
df['object_name_list'] = df['object_name_list'].map(lambda x: [class_name2index[item] for item in x])
## 固定图片大小
def get_final_image_and_box(filename, box, input_shape=[600, 600]):
    """Resize an image to a fixed size with padding and remap its boxes.

    The image is scaled by a single factor so it fits inside
    ``input_shape`` without distortion, then pasted centred on a grey
    canvas. The boxes are scaled and offset the same way.

    Args:
        filename: Path of the image file to open.
        box: Boxes as ``[x_min, y_min, x_max, y_max]`` rows; an empty list
            is allowed.
        input_shape: ``(height, width)`` of the output canvas. The default
            is only read, never mutated.

    Returns:
        ``(image_data, box)``: the padded image as a ``float32`` array of
        shape ``(height, width, 3)``, and the remapped boxes as a
        ``float32`` array. Boxes whose width or height is 1 pixel or less
        after clipping are dropped.
    """
    box = np.array(box).astype(np.float32)
    if box.size == 0:
        # An image without objects yields a 1-D empty array, which would
        # break the column indexing below.
        box = box.reshape(0, 4)
    h, w = input_shape

    # Close the file handle as soon as the resized copy exists.
    with Image.open(filename) as image:
        iw, ih = image.size
        scale = min(w/iw, h/ih)
        nw = int(iw*scale)
        nh = int(ih*scale)
        resized = image.resize((nw, nh), Image.BICUBIC)
    # Offsets that centre the resized image on the canvas.
    dx = (w-nw)//2
    dy = (h-nh)//2

    # Build the final image: grey canvas with the resized image in the middle.
    new_image = Image.new('RGB', (w, h), (128, 128, 128))
    new_image.paste(resized, (dx, dy))
    image_data = np.array(new_image, np.float32)

    # Build the final boxes: same scale and offset as the image.
    box[:, [0, 2]] = box[:, [0, 2]]*nw/iw + dx
    box[:, [1, 3]] = box[:, [1, 3]]*nh/ih + dy
    # Clip to the canvas.
    box[:, 0:2][box[:, 0:2] < 0] = 0
    box[:, 2][box[:, 2] > w] = w
    box[:, 3][box[:, 3] > h] = h
    box_w = box[:, 2] - box[:, 0]
    box_h = box[:, 3] - box[:, 1]
    box = box[np.logical_and(box_w > 1, box_h > 1)]
    return image_data, box
# Example: run one sample image and two hand-written boxes through
# get_final_image_and_box with a 600x600 target size.
filename = '../data/VOC2007/JPEGImages/000001.jpg'
target_box = [[9, 16, 374, 430], [378, 86, 625, 447]]
input_shape = [600, 600]
# target_box is rebound to the remapped boxes returned by the function.
image_data, target_box = get_final_image_and_box(filename, target_box, input_shape)
# Bare expression with no effect in a script; presumably left over from a
# notebook cell that displayed these values.
image_data.shape, target_box
def compute_iou(boxes0: np.ndarray, boxes1: np.ndarray):
    """Compute the pairwise intersection-over-union of two box sets.

    Args:
        boxes0: Array-like of shape ``(A, 4)`` as
            ``[x_min, y_min, x_max, y_max]``.
        boxes1: Array-like of shape ``(B, 4)`` in the same layout.

    Returns:
        Array of shape ``(A, B)`` where entry ``[i, j]`` is the IoU of
        ``boxes0[i]`` and ``boxes1[j]``.
    """
    first = np.array(boxes0)
    second = np.array(boxes1)
    pair_shape = (first.shape[0], second.shape[0], 2)

    # Corners of the overlap rectangle for every (i, j) pair.
    lower_right = np.minimum(
        np.broadcast_to(first[:, np.newaxis, 2:], pair_shape),
        np.broadcast_to(second[:, 2:], pair_shape),
    )
    upper_left = np.maximum(
        np.broadcast_to(first[:, np.newaxis, :2], pair_shape),
        np.broadcast_to(second[:, :2], pair_shape),
    )

    # Intersection area; negative extents mean no overlap and clip to 0.
    extent = np.clip(lower_right - upper_left, a_min=0, a_max=np.inf)
    inter = extent[:, :, 0] * extent[:, :, 1]

    # Area of each box, shaped so the sum broadcasts to (A, B).
    area_first = ((first[:, 2] - first[:, 0]) * (first[:, 3] - first[:, 1]))[:, np.newaxis]
    area_second = ((second[:, 2] - second[:, 0]) * (second[:, 3] - second[:, 1]))[np.newaxis, :]

    union = area_first + area_second - inter
    return inter / union
def get_cls_and_reg_data(anchors, target_box, threshold_min=0.3, threshold_max=0.7, sample_size=256):
    """Sample positive/negative anchors and list the anchors matched to each box.

    An anchor is positive when its IoU with at least one target box exceeds
    ``threshold_max``, and negative when its IoU with every target box is
    below ``threshold_min``.

    Args:
        anchors: Anchor boxes of shape ``(A, 4)``.
        target_box: Target boxes of shape ``(B, 4)``.
        threshold_min: Upper IoU bound for negative anchors.
        threshold_max: Lower IoU bound for positive anchors.
        sample_size: Number of indices drawn for each of the two groups.

    Returns:
        ``(rpn_cls, rpn_reg)``: ``rpn_cls`` is a 1-D index array with
        ``sample_size`` positive anchor indices followed by ``sample_size``
        negative ones, drawn with ``np.random.choice`` defaults.
        ``rpn_reg`` is a list with one entry per target box, holding the
        indices of all anchors that are positive for that box.

    Raises:
        ValueError: Raised by ``np.random.choice`` when there are no
            positive or no negative anchors to sample from.
    """
    # Compute the (A, B) IoU matrix once and derive both masks from it.
    iou = compute_iou(anchors, target_box)
    positive_iou = iou > threshold_max
    negative_iou = iou < threshold_min

    positive_candidates = np.where(np.any(positive_iou, axis=1))[0]
    negative_candidates = np.where(np.all(negative_iou, axis=1))[0]

    # Positives are drawn before negatives so the random stream is consumed
    # in a fixed order.
    positive_index = np.random.choice(positive_candidates, size=sample_size)
    negative_index = np.random.choice(negative_candidates, size=sample_size)
    rpn_cls = np.concatenate([positive_index, negative_index], axis=0)

    rpn_reg = [np.where(positive_iou[:, ix])[0].tolist() for ix in range(len(target_box))]
    return rpn_cls, rpn_reg
class RPN(tf.keras.Model):
    """Region proposal head on top of a frozen VGG16 feature extractor.

    Args:
        num_anchors: Number of output channels of the classification
            convolution; the regression convolution has four times as many.
        image_shape: ``(height, width, channels)`` of the input images,
            passed to VGG16 as ``input_shape``. Defaults to the 600x600 RGB
            shape this model was originally hard-wired to.
    """

    def __init__(self, num_anchors, image_shape=(600, 600, 3)):
        super().__init__()
        # The full VGG16 is only needed to cut out a sub-model, so it stays
        # a local rather than being assigned to ``self`` and then replaced.
        vgg16 = tf.keras.applications.vgg16.VGG16(include_top=False, input_shape=image_shape)
        # Use the second-to-last layer's output as the feature map
        # (presumably the last conv layer before the final pooling; verify
        # against the VGG16 layer list).
        self.get_feature_model = tf.keras.models.Model(inputs=vgg16.input, outputs=vgg16.layers[-2].output)
        self.get_feature_model.trainable = False
        self.conv_base = tf.keras.layers.Conv2D(512, (3, 3), padding='same', activation='relu', name='rpn_conv1')
        self.conv_class = tf.keras.layers.Conv2D(num_anchors, (1, 1), activation='sigmoid', name='rpn_out_class')
        self.conv_regr = tf.keras.layers.Conv2D(num_anchors * 4, (1, 1), activation='linear', name='rpn_out_regress')
        self.flatten = tf.keras.layers.Flatten()

    def call(self, x):
        """Run the model on a batch of images.

        Returns:
            ``(x_cls, x_reg)``: the flattened sigmoid output of the
            classification convolution, and the regression output reshaped
            to ``(batch, -1, 4)`` and then transposed to ``(batch, 4, -1)``.
        """
        x = self.get_feature_model(x)
        x = self.conv_base(x)
        x_cls = self.flatten(self.conv_class(x))
        x_reg = tf.reshape(self.conv_regr(x), [tf.shape(x)[0], -1, 4])
        x_reg = tf.transpose(x_reg, perm=[0, 2, 1])
        return x_cls, x_reg
# Build the RPN with num_anchors=9.
rpn = RPN(9)
# Batch made of two copies of the same preprocessed image.
x = np.stack([image_data,image_data])
y = [[[9, 16, 374, 430], [378, 86, 625, 447]], [[9, 16, 374, 430]