参考博客:https://blog.csdn.net/taoyanqi8932/article/details/71081390
参考代码:https://github.com/hjptriplebee/AlexNet_with_tensorflow
原理
1. Alex网络结构
整个网络有8个需要训练的层,前5个为卷积层,最后3层为全连接层.
第一个卷积层
输入的图片大小为:224*224*3
第一个卷积层为:11*11*96即尺寸为11*11,有96个卷积核,步长为4,卷积层后跟ReLU,因此输出的尺寸为 224/4=56,去掉边缘为55,因此其输出的每个feature map 为 55*55*96,同时后面跟LRN层,尺寸不变.
最大池化层,核大小为3*3,步长为2,因此feature map的大小为:27*27*96.
第二层卷积层
输入的tensor为27*27*96
卷积核的大小为: 5*5*256,步长为1,尺寸不会改变,同样紧跟ReLU和LRN层.
最大池化层,核大小为3*3,步长为2,因此feature map为:13*13*256
第三层至第五层卷积层
输入的tensor为13*13*256
第三层卷积为 3*3*384,步长为1,加上ReLU
第四层卷积为 3*3*384,步长为1,加上ReLU
第五层卷积为 3*3*256,步长为1,加上ReLU
第五层后跟最大池化层,核大小3*3,步长为2,因此feature map:6*6*256
第六层至第八层全连接层
接下来的三层为全连接层,分别为:
1. FC : 4096 + ReLU
2. FC:4096 + ReLU
3. FC: 1000
最后一层为softmax,输出1000类的概率值.
2. AlexNet中的trick
AlexNet将CNN用到了更深更宽的网络中,相比于以前的LeNet,其分类精度更高,其中有一些trick是必须要知道的.
ReLU的应用
AlexNet使用ReLU代替了Sigmoid,其能更快的训练,同时解决sigmoid在训练较深的网络中出现的梯度消失,或者说梯度弥散的问题.
Dropout随机失活
随机忽略一些神经元,以避免过拟合.
重叠的最大池化层
在以前的CNN中普遍使用平均池化层,AlexNet全部使用最大池化层,避免了平均池化层的模糊化的效果,并且步长比池化的核的尺寸小,这样池化层的输出之间有重叠,提升了特征的丰富性.
提出了LRN层
局部响应归一化,对局部神经元创建了竞争的机制,使得其中响应较大的值变得相对更大,并抑制其他反馈较小的神经元.
使用了GPU加速计算
使用了gpu加速神经网络的训练
数据增强
使用数据增强的方法缓解过拟合现象.
代码模仿了https://github.com/Zehaos/MobileNet/blob/master/nets/alexnet.py ,所需的模型文件需要自己下载;利用模型文件可以对图像进行预测,并把结果标注在图像中.
#coding = utf-8
import tensorflow as tf
import numpy as np
def maxPoolLayer(x, KHeight, KWidth, strideX, strideY, name, padding="SAME"):
    """Apply 2-D max pooling to ``x``.

    Args:
        x: Input tensor handed to ``tf.nn.max_pool``.
        KHeight: Pooling window height.
        KWidth: Pooling window width.
        strideX: Stride along the width (X) axis.
        strideY: Stride along the height (Y) axis.
        name: Name given to the pooling op.
        padding: Padding mode forwarded to TensorFlow ("SAME" or "VALID").

    Returns:
        The pooled tensor.
    """
    # The window is given as [1, height, width, 1], so the strides must use
    # the same height-then-width order: [1, strideY, strideX, 1]. This also
    # matches the stride order convLayer uses in this file.
    return tf.nn.max_pool(
        x,
        ksize=[1, KHeight, KWidth, 1],
        strides=[1, strideY, strideX, 1],
        padding=padding,
        name=name,
    )
def dropout(x, keepPro, name=None):
    """Apply dropout to ``x``.

    Args:
        x: Input tensor.
        keepPro: Value forwarded as the second positional argument of
            ``tf.nn.dropout`` (the keep probability in the TF 1.x API).
        name: Optional name for the dropout op.

    Returns:
        The tensor produced by ``tf.nn.dropout``.
    """
    # Pass ``name`` by keyword: the third positional parameter of
    # tf.nn.dropout is ``noise_shape``, so a positional name would be
    # misinterpreted as a noise shape.
    return tf.nn.dropout(x, keepPro, name=name)
def LRN(x, R, alpha, beta, name=None, bias=1.0):
    """Apply local response normalization to ``x``.

    Args:
        x: Input tensor.
        R: Forwarded as ``depth_radius``.
        alpha: Forwarded as ``alpha``.
        beta: Forwarded as ``beta``.
        name: Optional op name.
        bias: Forwarded as ``bias``.

    Returns:
        The tensor returned by ``tf.nn.local_response_normalization``.
    """
    lrnArgs = dict(
        depth_radius=R,
        alpha=alpha,
        beta=beta,
        bias=bias,
        name=name,
    )
    return tf.nn.local_response_normalization(x, **lrnArgs)
def fcLayer(x, inputD, outputD, reluFlag, name):
    """Build a fully connected layer, optionally followed by ReLU.

    Variables "w" (shape ``[inputD, outputD]``) and "b" (shape ``[outputD]``)
    are fetched/created inside the variable scope ``name``.

    Args:
        x: Input tensor multiplied by the weight matrix.
        inputD: Number of input features.
        outputD: Number of output features.
        reluFlag: When truthy, ReLU is applied to the affine output.
        name: Variable-scope name for the layer.

    Returns:
        ``relu(x @ w + b)`` if ``reluFlag`` is truthy, else ``x @ w + b``.
    """
    with tf.variable_scope(name) as scope:
        weights = tf.get_variable("w", shape=[inputD, outputD], dtype="float")
        biases = tf.get_variable("b", [outputD], dtype="float")
        affine = tf.nn.xw_plus_b(x, weights, biases, name=scope.name)
        if not reluFlag:
            return affine
        return tf.nn.relu(affine)
def convLayer(x, kHeight, kWidth, strideX, strideY, featureNum, name, padding="SAME", groups=1):
    """Build a (optionally grouped) 2-D convolution followed by bias and ReLU.

    The input and the kernel are both split into ``groups`` equal parts along
    axis 3; each input part is convolved with the matching kernel part and
    the results are concatenated along axis 3 again.

    Args:
        x: Input tensor; its last dimension must be statically known.
        kHeight: Kernel height.
        kWidth: Kernel width.
        strideX: Stride along the width (X) axis.
        strideY: Stride along the height (Y) axis.
        featureNum: Total number of output feature maps.
        name: Variable-scope name; also used as the name of the ReLU op.
        padding: Padding mode forwarded to ``tf.nn.conv2d``.
        groups: Number of groups the convolution is split into.

    Returns:
        The ReLU-activated feature map tensor.

    Raises:
        ValueError: If ``groups`` does not evenly divide the input channel
            count or ``featureNum``.
    """
    channel = int(x.get_shape()[-1])
    if channel % groups != 0 or featureNum % groups != 0:
        raise ValueError(
            "groups (%d) must evenly divide the input channels (%d) and "
            "featureNum (%d)" % (groups, channel, featureNum)
        )
    strides = [1, strideY, strideX, 1]
    with tf.variable_scope(name) as scope:
        # Floor division keeps the kernel depth an int; "/" would yield a
        # float under Python 3, which is not a valid shape dimension.
        w = tf.get_variable("w", shape=[kHeight, kWidth, channel // groups, featureNum])
        b = tf.get_variable("b", shape=[featureNum])
        xNew = tf.split(value=x, num_or_size_splits=groups, axis=3)
        wNew = tf.split(value=w, num_or_size_splits=groups, axis=3)
        featureMap = [
            tf.nn.conv2d(xPart, wPart, strides=strides, padding=padding)
            for xPart, wPart in zip(xNew, wNew)
        ]
        mergeFeatureMap = tf.concat(axis=3, values=featureMap)
        # bias_add keeps the shape of its input, so no reshape is needed;
        # reshaping to the static shape would fail when a dimension (e.g.
        # the batch size) is unknown.
        out = tf.nn.bias_add(mergeFeatureMap, b)
        return tf.nn.relu(out, name=scope.name)
class alexNet(object):
    """AlexNet graph: five convolutional layers followed by three FC layers.

    The graph is built on construction; the output of the last fully
    connected layer ("fc8") is exposed as ``self.fc3``.
    """

    def __init__(self, x, keepPro, classNum, skip, modelPath="bvlc_alexnet.npy"):
        """Store the configuration and build the network graph.

        Args:
            x: Input image tensor fed to the first convolution.
            keepPro: Value passed to both dropout layers.
            classNum: Number of outputs of the final layer.
            skip: Collection of layer names that ``loadModel`` leaves
                untouched.
            modelPath: Path of the ``.npy`` weight file read by ``loadModel``.
        """
        self.X = x
        self.KEEPPRO = keepPro
        self.CLASSNUM = classNum
        self.SKIP = skip
        self.MODELPATH = modelPath
        self.buildCNN()

    def buildCNN(self):
        """Assemble the network and store the final layer in ``self.fc3``.

        Each convolution / normalization / pooling tensor is printed as it is
        created.
        """
        conv1 = convLayer(self.X, 11, 11, 4, 4, 96, "conv1", "VALID")
        print(conv1)
        lrn1 = LRN(conv1, 2, 2e-05, 0.75, "norm1")
        print(lrn1)
        pool1 = maxPoolLayer(lrn1, 3, 3, 2, 2, "pool1", "VALID")
        print(pool1)

        conv2 = convLayer(pool1, 5, 5, 1, 1, 256, "conv2", groups=2)
        print(conv2)
        lrn2 = LRN(conv2, 2, 2e-05, 0.75, "lrn2")
        print(lrn2)
        pool2 = maxPoolLayer(lrn2, 3, 3, 2, 2, "pool2", "VALID")
        print(pool2)

        conv3 = convLayer(pool2, 3, 3, 1, 1, 384, "conv3")
        print(conv3)
        conv4 = convLayer(conv3, 3, 3, 1, 1, 384, "conv4", groups=2)
        print(conv4)
        conv5 = convLayer(conv4, 3, 3, 1, 1, 256, "conv5", groups=2)
        print(conv5)
        pool5 = maxPoolLayer(conv5, 3, 3, 2, 2, "pool5", "VALID")
        print(pool5)

        # Flatten the pooled feature maps for the fully connected layers.
        fcIn = tf.reshape(pool5, [-1, 256 * 6 * 6])
        fc1 = fcLayer(fcIn, 256 * 6 * 6, 4096, True, "fc6")
        dropout1 = dropout(fc1, self.KEEPPRO)
        fc2 = fcLayer(dropout1, 4096, 4096, True, "fc7")
        dropout2 = dropout(fc2, self.KEEPPRO)
        # The last layer outputs raw class scores, so no ReLU is applied:
        # clamping the scores at zero would discard every negative logit
        # before any softmax computed on top of them.
        self.fc3 = fcLayer(dropout2, 4096, self.CLASSNUM, False, "fc8")

    def loadModel(self, sess):
        """Assign the weights stored in ``self.MODELPATH`` to the graph.

        The file is expected to hold a dict mapping a layer name to an
        iterable of arrays. For every layer not listed in ``self.SKIP``,
        1-D arrays are assigned to that scope's "b" variable and all other
        arrays to its "w" variable.

        Args:
            sess: Session used to run the assign ops.
        """
        # The weight file stores a pickled dict, which NumPy only loads when
        # allow_pickle=True (the default became False in NumPy 1.16.3).
        # NOTE: unpickling can execute arbitrary code - only load weight
        # files from a trusted source.
        wDict = np.load(self.MODELPATH, encoding="bytes", allow_pickle=True).item()
        for name in wDict:
            if name in self.SKIP:
                continue
            with tf.variable_scope(name, reuse=True):
                for p in wDict[name]:
                    # 1-D arrays are biases, everything else is a weight.
                    varName = 'b' if len(p.shape) == 1 else 'w'
                    sess.run(tf.get_variable(varName, trainable=False).assign(p))
#test
import os
import sys
import cv2
import caffe_classes
# Demo: classify every image found in ``testPath`` and show the predicted
# class name drawn onto the image.
dropoutPro = 1
classNum = 1000
skip = []
testPath = "testModel"
inputSize = 228  # side length of the square network input

testImg = []
for f in os.listdir(testPath):
    img = cv2.imread(os.path.join(testPath, f))
    # cv2.imread returns None for files it cannot decode; skip those
    # instead of failing later when the image is processed.
    if img is not None:
        testImg.append(img)

# np.float was removed from NumPy; np.float64 is the dtype it resolved to.
imgMean = np.array([104, 117, 124], np.float64)

x = tf.placeholder("float", [1, inputSize, inputSize, 3])
model = alexNet(x, dropoutPro, classNum, skip)
score = model.fc3
softmax = tf.nn.softmax(score)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    model.loadModel(sess)
    for img in testImg:
        # Resize to the network input size and subtract the per-channel mean.
        test = cv2.resize(img.astype(np.float64), (inputSize, inputSize))
        test -= imgMean
        probs = sess.run(softmax, feed_dict={x: test.reshape(1, inputSize, inputSize, 3)})
        maxx = np.argmax(probs)
        res = caffe_classes.class_names[maxx]
        font = cv2.FONT_HERSHEY_SIMPLEX
        # putText expects the origin as (x, y) = (column, row), while
        # img.shape is (rows, columns, channels).
        origin = (int(img.shape[1] / 3), int(img.shape[0] / 3))
        cv2.putText(img, res, origin, font, 1, (0, 255, 0), 2)
        cv2.imshow("demo", img)
        cv2.waitKey(0)