AlexNet: Principles and TensorFlow Implementation

Date: 2022-03-16 13:50:49

Reference blog: https://blog.csdn.net/taoyanqi8932/article/details/71081390
Reference code: https://github.com/hjptriplebee/AlexNet_with_tensorflow

Principles

1. AlexNet network structure

The whole network has 8 trainable layers: the first 5 are convolutional layers and the last 3 are fully connected layers.

First convolutional layer

The input image size is 224*224*3.

The first convolutional layer is 11*11*96, i.e. 96 kernels of size 11*11 with stride 4, followed by ReLU. (The paper states a 224*224 input, but the arithmetic only works out with an effective input of 227*227: (227 - 11)/4 + 1 = 55.) Each output feature map is therefore 55*55, giving 55*55*96, and the LRN layer that follows does not change the size.

A max pooling layer with a 3*3 window and stride 2 follows, so the feature maps become (55 - 3)/2 + 1 = 27, i.e. 27*27*96.
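As a quick sanity check of these sizes (a standalone sketch, not part of the implementation below), the standard formula for an unpadded (VALID) convolution or pooling, floor((input - kernel)/stride) + 1, reproduces the numbers when the input is taken as 227:

def valid_out(size, kernel, stride):
    # output size of a VALID (no padding) convolution or pooling layer
    return (size - kernel) // stride + 1

conv1 = valid_out(227, 11, 4)   # (227 - 11) / 4 + 1 = 55
pool1 = valid_out(conv1, 3, 2)  # (55 - 3) / 2 + 1 = 27
print(conv1, pool1)             # 55 27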

Second convolutional layer

The input tensor is 27*27*96.

The kernel size is 5*5 with 256 kernels and stride 1; with SAME padding the spatial size does not change, so the output is 27*27*256, again followed by ReLU and an LRN layer.

A max pooling layer with a 3*3 window and stride 2 follows, so the feature maps become 13*13*256.

Third to fifth convolutional layers

The input tensor is 13*13*256.

The third convolution is 3*3*384 with stride 1, followed by ReLU.

The fourth convolution is 3*3*384 with stride 1, followed by ReLU.

The fifth convolution is 3*3*256 with stride 1, followed by ReLU.

The fifth layer is followed by a max pooling layer with a 3*3 window and stride 2, so the feature maps become 6*6*256.

Sixth to eighth layers: fully connected

The next three layers are fully connected:
1. FC: 4096 + ReLU
2. FC: 4096 + ReLU
3. FC: 1000

The output of the last layer is fed to a softmax that produces probabilities over the 1000 classes.
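Tracing the whole forward pass with the same output-size formula (SAME-padded convolutions keep the spatial size) confirms the numbers above; this is only an illustrative check, again assuming an effective 227*227 input:

def valid_out(size, kernel, stride):
    return (size - kernel) // stride + 1

size = 227
size = valid_out(size, 11, 4)   # conv1 (VALID, 11*11, stride 4) -> 55*55*96
size = valid_out(size, 3, 2)    # pool1                          -> 27*27*96
                                # conv2 (SAME, 5*5)              -> 27*27*256
size = valid_out(size, 3, 2)    # pool2                          -> 13*13*256
                                # conv3-conv5 (SAME, 3*3)        -> 13*13*384/384/256
size = valid_out(size, 3, 2)    # pool5                          ->  6*6*256
print(size * size * 256)        # 9216 inputs to fc6 -> 4096 -> 4096 -> 1000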

2. Tricks in AlexNet

AlexNet applied CNNs to a much deeper and wider network and achieved higher classification accuracy than the earlier LeNet. Several of its tricks are worth knowing.

Use of ReLU

AlexNet uses ReLU instead of sigmoid, which trains faster and alleviates the vanishing-gradient (gradient diffusion) problem that sigmoid causes in deeper networks.
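A small numpy illustration of why this matters: the sigmoid derivative is at most 0.25 and shrinks quickly for large |x|, whereas the ReLU gradient stays at 1 for any positive input, so gradients do not decay as they pass through many layers:

import numpy as np

def sigmoid_grad(x):
    s = 1.0 / (1.0 + np.exp(-x))
    return s * (1.0 - s)          # peaks at 0.25, vanishes for large |x|

def relu_grad(x):
    return float(x > 0)           # 1 for positive inputs, 0 otherwise

for x in [0.0, 2.0, 5.0, 10.0]:
    print(x, sigmoid_grad(x), relu_grad(x))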

Dropout

Neurons are randomly dropped during training to avoid overfitting.
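In TF 1.x this is a single op; a minimal sketch (keep probability 0.5 during training and 1.0 at inference, the same role keepPro plays in the implementation below):

import tensorflow as tf

fc = tf.placeholder(tf.float32, [None, 4096])
keep_prob = tf.placeholder(tf.float32)   # 0.5 when training, 1.0 when predicting
dropped = tf.nn.dropout(fc, keep_prob)   # zeroes random units and rescales the rest by 1/keep_prob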

Overlapping max pooling

Earlier CNNs commonly used average pooling; AlexNet uses max pooling throughout, which avoids the blurring effect of average pooling. In addition, the stride is smaller than the pooling window, so neighbouring pooling outputs overlap, which improves the richness of the features.
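Overlapping simply means the stride is smaller than the window; a small sketch with the 3*3 window and stride 2 used below:

import tensorflow as tf

x = tf.placeholder(tf.float32, [1, 55, 55, 96])
# 3*3 window with stride 2: adjacent windows overlap by one row/column
pool = tf.nn.max_pool(x, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding="VALID")
print(pool.get_shape())   # (1, 27, 27, 96)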

Introduction of the LRN layer

Local Response Normalization creates a competition mechanism among neighbouring neurons: relatively large responses become even larger, while neurons with smaller responses are suppressed.
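TensorFlow exposes this directly as tf.nn.local_response_normalization; the call below uses the same hyperparameters as the implementation later in this post:

import tensorflow as tf

conv1 = tf.placeholder(tf.float32, [1, 55, 55, 96])
# each activation is normalized by the squared activations of its neighbours along the channel axis
norm1 = tf.nn.local_response_normalization(conv1, depth_radius=2, alpha=2e-05, beta=0.75, bias=1.0)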

GPU-accelerated computation

GPUs are used to accelerate the training of the neural network.

Data augmentation

Data augmentation is used to reduce overfitting.
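A minimal sketch of the crop-and-flip augmentation described in the paper (random patches cropped from the resized training image plus horizontal mirroring; the paper crops 224*224 patches from 256*256 images, here 227*227 is used to match the effective input size above), written with TF 1.x image ops:

import tensorflow as tf

image = tf.placeholder(tf.float32, [256, 256, 3])   # a resized training image
crop = tf.random_crop(image, [227, 227, 3])         # random 227*227 patch
flipped = tf.image.random_flip_left_right(crop)     # random horizontal mirror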

The code follows https://github.com/Zehaos/MobileNet/blob/master/nets/alexnet.py. The pretrained model file (bvlc_alexnet.npy) has to be downloaded separately; with it the network can classify images and draw the predicted label onto the image.

# coding: utf-8
import tensorflow as tf
import numpy as np

def maxPoolLayer(x, KHeight, KWidth, strideX, strideY, name, padding="SAME"):
    # max pooling; ksize and strides are in NHWC order (height before width)
    return tf.nn.max_pool(x, ksize=[1,KHeight,KWidth,1], strides=[1,strideY,strideX,1], padding=padding, name=name)
def dropout(x, keepPro, name=None):
    # pass name as a keyword: the third positional argument of tf.nn.dropout is noise_shape, not name
    return tf.nn.dropout(x, keepPro, name=name)
def LRN(x, R, alpha, beta, name=None, bias=1.0):
    return tf.nn.local_response_normalization(x,depth_radius=R,alpha=alpha,beta=beta,bias=bias,name=name)
def fcLayer(x, inputD, outputD, reluFlag, name):
    with tf.variable_scope(name) as scope:
        w = tf.get_variable("w", shape=[inputD,outputD], dtype="float")
        b = tf.get_variable("b", [outputD], dtype="float")
        out = tf.nn.xw_plus_b(x,w,b,name=scope.name)
        if(reluFlag):
            return tf.nn.relu(out)
        else:
            return out
def convLayer(x, kHeight, kWidth, strideX, strideY, featureNum, name, padding = "SAME", groups = 1):
    # convolution with optional grouping (groups=2 reproduces the two-GPU split of the original AlexNet)
    channel = int(x.get_shape()[-1])
    conv = lambda a, b: tf.nn.conv2d(a, b, strides = [1, strideY, strideX, 1], padding = padding)
    with tf.variable_scope(name) as scope:
        # integer division so the filter shape stays an int when groups > 1
        w = tf.get_variable("w", shape = [kHeight, kWidth, channel // groups, featureNum])
        b = tf.get_variable("b", shape = [featureNum])
        xNew = tf.split(value = x, num_or_size_splits = groups, axis = 3)
        wNew = tf.split(value = w, num_or_size_splits = groups, axis = 3)
        featureMap = [conv(t1, t2) for t1, t2 in zip(xNew, wNew)]
        mergeFeatureMap = tf.concat(axis = 3, values = featureMap)
        # print mergeFeatureMap.shape
        out = tf.nn.bias_add(mergeFeatureMap, b)
    return tf.nn.relu(tf.reshape(out, mergeFeatureMap.get_shape().as_list()), name = scope.name)
class alexNet(object):
    def __init__(self, x, keepPro, classNum, skip, modelPath="bvlc_alexnet.npy"):
        self.X = x
        self.KEEPPRO = keepPro
        self.CLASSNUM = classNum
        self.SKIP = skip
        self.MODELPATH = modelPath
        self.buildCNN()
    def buildCNN(self):
        conv1 = convLayer(self.X, 11, 11, 4, 4, 96, "conv1", "VALID")
        print(conv1)
        lrn1 = LRN(conv1, 2, 2e-05, 0.75, "norm1")
        print(lrn1)
        pool1 = maxPoolLayer(lrn1, 3, 3, 2, 2, "pool1", "VALID")
        print(pool1)
        conv2 = convLayer(pool1, 5, 5, 1, 1, 256, "conv2", groups=2)
        print(conv2)
        lrn2 = LRN(conv2, 2, 2e-05, 0.75, "lrn2")
        print(lrn2)
        pool2 = maxPoolLayer(lrn2, 3, 3, 2, 2, "pool2", "VALID")
        print(pool2)

        conv3 = convLayer(pool2, 3, 3, 1, 1, 384, "conv3")
        print(conv3)
        conv4 = convLayer(conv3, 3, 3, 1, 1, 384, "conv4", groups=2)
        print(conv4)
        conv5 = convLayer(conv4, 3, 3, 1, 1, 256, "conv5", groups=2)
        print(conv5)
        pool5 = maxPoolLayer(conv5, 3, 3, 2, 2, "pool5", "VALID")
        print(pool5)
        fcIn = tf.reshape(pool5, [-1, 256 * 6 * 6])
        fc1 = fcLayer(fcIn, 256 * 6 * 6, 4096, True, "fc6")
        dropout1 = dropout(fc1, self.KEEPPRO)

        fc2 = fcLayer(dropout1, 4096, 4096, True, "fc7")
        dropout2 = dropout(fc2, self.KEEPPRO)

        # fc8 outputs raw class scores (no ReLU); softmax is applied outside the model
        self.fc3 = fcLayer(dropout2, 4096, self.CLASSNUM, False, "fc8")

    def loadModel(self, sess):
        # the .npy file stores a pickled dict {layer name: [weights, biases]}, so allow_pickle is required
        wDict = np.load(self.MODELPATH, encoding="bytes", allow_pickle=True).item()
        for name in wDict:
            if(name not in self.SKIP):
                with tf.variable_scope(name, reuse=True):
                    for p in wDict[name]:
                        if(len(p.shape)==1):
                            #bias
                            sess.run(tf.get_variable('b', trainable=False).assign(p))
                        else:
                            sess.run(tf.get_variable('w', trainable=False).assign(p))

#test
import os
import sys
import cv2
import caffe_classes

dropoutPro = 1
classNum = 1000
skip = []
testPath = "testModel"
testImg = []
for f in os.listdir(testPath):
    testImg.append(cv2.imread(testPath+"/"+f))

imgMean = np.array([104,117,124], np.float32)  # per-channel (BGR) mean subtracted from the input
x = tf.placeholder("float", [1,228,228,3])

model = alexNet(x, dropoutPro, classNum, skip)
score = model.fc3
softmax = tf.nn.softmax(score)

#init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    model.loadModel(sess)
    for key, img in enumerate(testImg):
        test = cv2.resize(img.astype(np.float32), (228, 228))
        test -= imgMean
        maxx = np.argmax(sess.run(softmax, feed_dict={x:test.reshape(1,228,228,3)}))
        res = caffe_classes.class_names[maxx]
        font = cv2.FONT_HERSHEY_SIMPLEX
        # putText expects the origin as (x, y) = (column, row)
        cv2.putText(img, res, (int(img.shape[1] / 3), int(img.shape[0] / 3)), font, 1, (0, 255, 0), 2)
        cv2.imshow("demo", img)
        cv2.waitKey(0)