Python深度学习基于Tensorflow(13)目标检测实战

时间:2024-06-08 07:08:03

文章目录

  • RPN 整体代码
  • RPN 具体实现过程
    • 数据标注
    • 读取标注数据
    • 固定图片大小调整目标框
    • 使用预训练模型获取 feature_shape
    • 定义 RPN 网络
    • 生成RPN 的 CLS 和 REG 数据集
      • 获取所有的锚点
      • 计算锚点与目标框的IOU
    • 定义 RPN loss 和 训练过程
  • 参考资料

请添加图片描述

这里实现的是二阶段目标检测,其主要由一个RPN框架和ROI框架构成,后者只是一个图片分类任务,前者较为麻烦,这里只实现前者RPN过程

RPN 整体代码

import xml.etree.ElementTree as ET
import os
import pandas as pd
import tensorflow as tf
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

def generate_anchors(sizes = [128, 256, 512], ratios = [[1, 1], [1, 2], [2, 1]]):
    num_anchors = len(sizes) * len(ratios)

    anchors = np.zeros((num_anchors, 4))
    anchors[:, 2:] = np.tile(sizes, (2, len(ratios))).T
    
    for i in range(len(ratios)):
        anchors[3 * i: 3 * i + 3, 2] = anchors[3 * i: 3 * i + 3, 2] * ratios[i][0]
        anchors[3 * i: 3 * i + 3, 3] = anchors[3 * i: 3 * i + 3, 3] * ratios[i][1]
    
    anchors[:, 0::2] -= np.tile(anchors[:, 2] * 0.5, (2, 1)).T
    anchors[:, 1::2] -= np.tile(anchors[:, 3] * 0.5, (2, 1)).T
    return anchors

def shift(shape, anchors, stride=16):
    shift_x = (np.arange(0, shape[1], dtype=np.float32) + 0.5) * stride
    shift_y = (np.arange(0, shape[0], dtype=np.float32) + 0.5) * stride

    shift_x, shift_y = np.meshgrid(shift_x, shift_y)

    shift_x = np.reshape(shift_x, [-1])
    shift_y = np.reshape(shift_y, [-1])

    shifts = np.stack([shift_x, shift_y, shift_x, shift_y], axis=0)

    shifts = np.transpose(shifts)
    number_of_anchors = np.shape(anchors)[0]

    k = np.shape(shifts)[0]

    shifted_anchors = np.reshape(anchors, [1, number_of_anchors, 4]) + np.array(np.reshape(shifts, [k, 1, 4]), dtype=np.float32)
    shifted_anchors = np.reshape(shifted_anchors, [k * number_of_anchors, 4])
    return shifted_anchors

def get_anchors(input_shape, feature_shape, sizes = [128, 256, 512], ratios = [[1, 1], [1, 2], [2, 1]], stride=16):
    anchors = generate_anchors(sizes = sizes, ratios = ratios)
    anchors = shift(feature_shape, anchors, stride = stride)
    anchors[:, ::2] = np.clip(anchors[:, ::2], 0, input_shape[1])
    anchors[:, 1::2] = np.clip(anchors[:, 1::2], 0, input_shape[0])
    return anchors

%%time
anchors = get_anchors([600,600], [37,37])

anchors

## 数据准备

def get_xml_box(file_path, return_object_name=False):
    """返回的形式类似于:..[filename, object_name, xmin, ymin, xmax, ymax]"""
    tree = ET.parse(file_path)
    root = tree.getroot()
    
    filename = root.find('filename').text
    object_name_list = []
    box_list = []
    
    for item in root.iter('object'):
        object_name = item.find('name').text
        
        box = item.find('bndbox')
        xmin = box.find('xmin').text
        ymin = box.find('ymin').text
        xmax = box.find('xmax').text
        ymax = box.find('ymax').text
        
        object_name_list.append(object_name)
        box_list.append([xmin, ymin, xmax, ymax])
        
    return [filename, object_name_list, box_list]

xml_files = ['../data/VOC2007/Annotations/' + xml_file for xml_file in os.listdir('../data/VOC2007/Annotations/') if xml_file.endswith('xml')]
data = [get_xml_box(xml_file) for xml_file in xml_files]
df = pd.DataFrame(data)
df.columns = ['filename', 'object_name_list', 'box_list']
df['filename'] = '../data/VOC2007/JPEGImages/' + df['filename']

df.head()

class_name = set([item for items in df.object_name_list.values.tolist() for item in items])
class_nums = len(class_name) + 1
class_name2index = dict(zip(class_name, range(1, class_nums)))
class_index2name = dict(zip(range(1, class_nums), class_name))

df['object_name_list'] = df['object_name_list'].map(lambda x: [class_name2index[item] for item in x])

df.head()

## 固定图片大小

def get_final_image_and_box(filename, box, input_shape=[600, 600]):
    image = Image.open(filename)
    box = np.array(box).astype(np.float32)
    iw, ih = image.size
    h, w = input_shape
    
    scale = min(w/iw, h/ih)
    nw = int(iw*scale)
    nh = int(ih*scale)
    dx = (w-nw)//2
    dy = (h-nh)//2

    # 获取final_image
    image = image.resize((nw,nh), Image.BICUBIC)
    new_image = Image.new('RGB', (w,h), (128,128,128))
    new_image.paste(image, (dx, dy))
    image_data  = np.array(new_image, np.float32)
    
    # 获取final_box
    box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
    box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
    box[:, 0:2][box[:, 0:2]<0] = 0
    box[:, 2][box[:, 2]>w] = w
    box[:, 3][box[:, 3]>h] = h
    box_w = box[:, 2] - box[:, 0]
    box_h = box[:, 3] - box[:, 1]
    box = box[np.logical_and(box_w>1, box_h>1)]

    return image_data, box

filename = '../data/VOC2007/JPEGImages/000001.jpg'
target_box = [[9, 16, 374, 430], [378, 86, 625, 447]]
input_shape = [600, 600]

image_data, target_box = get_final_image_and_box(filename, target_box, input_shape)
image_data.shape, target_box

def compute_iou(boxes0: np.ndarray, boxes1: np.ndarray):
    """ 
    计算多个边界框和多个边界框的交并比
    boxes0: `~np.ndarray` of shape `(A, 4)`
    boxes1: `~np.ndarray` of shape `(B, 4)`
    Returns iou: `~np.ndarray` of shape `(A, B)`
    """
    boxes0 = np.array(boxes0)
    boxes1 = np.array(boxes1)
    A = boxes0.shape[0]
    B = boxes1.shape[0]

    xy_max = np.minimum(boxes0[:, np.newaxis, 2:].repeat(B, axis=1),
                        np.broadcast_to(boxes1[:, 2:], (A, B, 2)))
    xy_min = np.maximum(boxes0[:, np.newaxis, :2].repeat(B, axis=1),
                        np.broadcast_to(boxes1[:, :2], (A, B, 2)))

    # 计算交集面积
    inter = np.clip(xy_max-xy_min, a_min=0, a_max=np.inf)
    inter = inter[:, :, 0]*inter[:, :, 1]

    # 计算每个矩阵的面积
    area_0 = ((boxes0[:, 2]-boxes0[:, 0])*(
        boxes0[:, 3] - boxes0[:, 1]))[:, np.newaxis].repeat(B, axis=1)
    area_1 = ((boxes1[:, 2] - boxes1[:, 0])*(
        boxes1[:, 3] - boxes1[:, 1]))[np.newaxis, :].repeat(A, axis=0)

    return inter/(area_0+area_1-inter)

def get_cls_and_reg_data(anchors, target_box, threshold_min=0.3, threshold_max=0.7, sample_size=256):
    positive_iou = compute_iou(anchors, target_box)>threshold_max
    negative_iou = compute_iou(anchors, target_box)<threshold_min
    positive_cls = np.any(positive_iou, axis=1).astype(np.float32)
    negative_cls = np.all(negative_iou, axis=1).astype(np.float32)
    positive_index = np.random.choice(np.where(positive_cls==1)[0], size=sample_size)
    negative_index = np.random.choice(np.where(negative_cls==1)[0], size=sample_size)
    rpn_cls = np.concatenate([positive_index, negative_index], axis=0)
    rpn_reg = [np.where(positive_iou[:,ix]==True)[0].tolist() for ix in range(len(target_box))]
    return rpn_cls, rpn_reg

class RPN(tf.keras.Model):
    def __init__(self, num_anchors):
        super(RPN, self).__init__()
        self.get_feature_model = tf.keras.applications.vgg16.VGG16(include_top=False, input_shape=[600, 600, 3])
        self.get_feature_model = tf.keras.models.Model(inputs=self.get_feature_model.input, outputs=self.get_feature_model.layers[-2].output)
        self.get_feature_model.trainable = False
        self.conv_base = tf.keras.layers.Conv2D(512, (3, 3), padding='same', activation='relu', name='rpn_conv1')
        self.conv_class = tf.keras.layers.Conv2D(num_anchors, (1, 1), activation='sigmoid', name='rpn_out_class')
        self.conv_regr  = tf.keras.layers.Conv2D(num_anchors * 4, (1, 1), activation='linear', name='rpn_out_regress')
        self.flatten = tf.keras.layers.Flatten()

    def call(self, x):
        x = self.get_feature_model(x)
        x = self.conv_base(x)
        x_cls = self.flatten(self.conv_class(x))
        x_reg = tf.reshape(self.conv_regr(x), [tf.shape(x)[0], -1, 4])
        x_reg = tf.transpose(x_reg, perm=[0, 2, 1])
        return x_cls, x_reg

rpn = RPN(9)

x = np.stack([image_data,image_data])
y = [[[9, 16, 374, 430], [378, 86, 625, 447]], [[9, 16, 374, 430]