RPN
关于faster rcnn中PRN的介绍大家可以自己看paper或者找点论坛看看, medium, CSDN, 知乎, 包括简书都有大量的资料做介绍, 本文只站在源码的角度给你介绍每一步的实现, 所以就不阐述原理了,见谅~~
代码入口
lib/model/train_val.py# Construct the computation graph lr, train_op = self.construct_graph(sess)
lr是学习率, train_op是训练网络的一系列操作。
让我们走进construct_graph函数
lib/model/train_val.py def construct_graph(self, sess): with sess.graph.as_default(): # Set the random seed for tensorflow tf.set_random_seed(cfg.RNG_SEED) # Build the main computation graph layers = self.net.create_architecture('TRAIN', self.imdb.num_classes, tag='default', anchor_scales=cfg.ANCHOR_SCALES, anchor_ratios=cfg.ANCHOR_RATIOS) # Define the loss loss = layers['total_loss'] # Set learning rate and momentum lr = tf.Variable(cfg.TRAIN.LEARNING_RATE, trainable=False) self.optimizer = tf.train.MomentumOptimizer(lr, cfg.TRAIN.MOMENTUM) # Compute the gradients with regard to the loss gvs = self.optimizer.compute_gradients(loss) # Double the gradient of the bias if set if cfg.TRAIN.DOUBLE_BIAS: final_gvs = [] with tf.variable_scope('Gradient_Mult') as scope: for grad, var in gvs: scale = 1. if cfg.TRAIN.DOUBLE_BIAS and '/biases:' in var.name: scale *= 2. if not np.allclose(scale, 1.0): grad = tf.multiply(grad, scale) final_gvs.append((grad, var)) train_op = self.optimizer.apply_gradients(final_gvs) else: train_op = self.optimizer.apply_gradients(gvs) # We will handle the snapshots ourselves self.saver = tf.train.Saver(max_to_keep=100000) # Write the train and validation information to tensorboard self.writer = tf.summary.FileWriter(self.tbdir, sess.graph) self.valwriter = tf.summary.FileWriter(self.tbvaldir) return lr, train_op
代码其实将流程阐述的非常清楚,我再废话给大家总结一下~~
给tensorflow设置随机种子seed(为啥要这样,可以百度一下)
建立一个计算图computational graph(重点,下面介绍)
定义了一个执行Momentum算法的优化器
accumulation = momentum * accumulation + gradient
variable -= learning_rate * accumulation
计算损失参数的梯度self.optimizer.compute_gradients(loss)
将梯度应用于变量self.optimizer.apply_gradients(gvs), 返回值就是train_op
定义Saver(用于快照-缓存), writer, valwriter(把信息及时传入tensorboard)
然后走进create_architecture函数
lib/nets/network.py def create_architecture(self, mode, num_classes, tag=None, anchor_scales=(8, 16, 32), anchor_ratios=(0.5, 1, 2)): self._image = tf.placeholder(tf.float32, shape=[1, None, None, 3]) self._im_info = tf.placeholder(tf.float32, shape=[3]) self._gt_boxes = tf.placeholder(tf.float32, shape=[None, 5]) self._tag = tag self._num_classes = num_classes self._mode = mode self._anchor_scales = anchor_scales self._num_scales = len(anchor_scales) self._anchor_ratios = anchor_ratios self._num_ratios = len(anchor_ratios) self._num_anchors = self._num_scales * self._num_ratios training = mode == 'TRAIN' testing = mode == 'TEST' assert tag != None # handle most of the regularizers here weights_regularizer = tf.contrib.layers.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY) if cfg.TRAIN.BIAS_DECAY: biases_regularizer = weights_regularizer else: biases_regularizer = tf.no_regularizer # list as many types of layers as possible, even if they are not used now with arg_scope([slim.conv2d, slim.conv2d_in_plane, \ slim.conv2d_transpose, slim.separable_conv2d, slim.fully_connected], weights_regularizer=weights_regularizer, biases_regularizer=biases_regularizer, biases_initializer=tf.constant_initializer(0.0)): rois, cls_prob, bbox_pred = self._build_network(training) layers_to_output = {'rois': rois} for var in tf.trainable_variables(): self._train_summaries.append(var) if testing: stds = np.tile(np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (self._num_classes)) means = np.tile(np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (self._num_classes)) self._predictions["bbox_pred"] *= stds self._predictions["bbox_pred"] += means else: self._add_losses() layers_to_output.update(self._losses) val_summaries = [] with tf.device("/cpu:0"): val_summaries.append(self._add_gt_image_summary()) for key, var in self._event_summaries.items(): val_summaries.append(tf.summary.scalar(key, var)) for key, var in self._score_summaries.items(): self._add_score_summary(key, var) for var in self._act_summaries: self._add_act_summary(var) for var in self._train_summaries: self._add_train_summary(var) self._summary_op = tf.summary.merge_all() self._summary_op_val = tf.summary.merge(val_summaries) layers_to_output.update(self._predictions) return layers_to_output
很多人(包括我自己)对tensorflow还不是很熟悉,所以这里还是给大家概括一下程序流程
给network的成员变量赋值
定义权重weights的正则regularizer
建立网络self._build_network(training) (重点)
定义损失函数, 包括RPN class loss, RPN bbox loss,整个RCNN网络的class loss和最终确定的物体边框bbox loss, 细节可以看这个函数_add_losses
更新一下tensorboard用得到的参数
然后我们了解一下_build_network函数
lib/nets/network.py def _build_network(self, is_training=True): # select initializers if cfg.TRAIN.TRUNCATED: initializer = tf.truncated_normal_initializer(mean=0.0, stddev=0.01) initializer_bbox = tf.truncated_normal_initializer(mean=0.0, stddev=0.001) else: initializer = tf.random_normal_initializer(mean=0.0, stddev=0.01) initializer_bbox = tf.random_normal_initializer(mean=0.0, stddev=0.001) net_conv = self._image_to_head(is_training) with tf.variable_scope(self._scope, self._scope): # build the anchors for the image self._anchor_component() # region proposal network rois = self._region_proposal(net_conv, is_training, initializer) # region of interest pooling if cfg.POOLING_MODE == 'crop': pool5 = self._crop_pool_layer(net_conv, rois, "pool5") else: raise NotImplementedError fc7 = self._head_to_tail(pool5, is_training) with tf.variable_scope(self._scope, self._scope): # region classification cls_prob, bbox_pred = self._region_classification(fc7, is_training, initializer, initializer_bbox) self._score_summaries.update(self._predictions) return rois, cls_prob, bbox_pred
初始化权重weight, 用截断的normal initializer或者随机的normal initializer
构建主干网络前端_image_to_head
构建anchors
构建RPN
ROI pooling 调用函数_crop_pool_layer
构建主干网络的尾部 fc7 = self._head_to_tail(pool5, is_training)
object分类以及边框预测的回归
各位是不是一脸萌币。。。不要紧, 下面我会给大家详细介绍上述的每一个步骤。
构建主干网络前端
_image_to_head方法是一个类Network的一个abstract class, 以它的实现类Resnet 101为例
lib/nets/resnet_v1.py def _image_to_head(self, is_training, reuse=None): assert (0 <= cfg.RESNET.FIXED_BLOCKS <= 3) # Now the base is always fixed during training with slim.arg_scope(resnet_arg_scope(is_training=False)): net_conv = self._build_base() if cfg.RESNET.FIXED_BLOCKS > 0: with slim.arg_scope(resnet_arg_scope(is_training=False)): net_conv, _ = resnet_v1.resnet_v1(net_conv, self._blocks[0:cfg.RESNET.FIXED_BLOCKS], global_pool=False, include_root_block=False, reuse=reuse, scope=self._scope) if cfg.RESNET.FIXED_BLOCKS < 3: with slim.arg_scope(resnet_arg_scope(is_training=is_training)): net_conv, _ = resnet_v1.resnet_v1(net_conv, self._blocks[cfg.RESNET.FIXED_BLOCKS:-1], global_pool=False, include_root_block=False, reuse=reuse, scope=self._scope) self._act_summaries.append(net_conv) self._layers['head'] = net_conv return net_conv def _build_base(self): with tf.variable_scope(self._scope, self._scope): net = resnet_utils.conv2d_same(self._image, 64, 7, stride=2, scope='conv1') net = tf.pad(net, [[0, 0], [1, 1], [1, 1], [0, 0]]) net = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID', scope='pool1')
我会在下一篇文章中专门介绍resnet, 这里还是只做一个流程的简介。
调用_build_base函数手动建立初始的几层: input -> 64 * 7 * 7 filters, stride = 2 -> padding -> max pooling
构建网络主干, 因为之前定义过self._blocks
self._blocks = [resnet_v1_block('block1', base_depth=64, num_units=3, stride=2), resnet_v1_block('block2', base_depth=128, num_units=8, stride=2), # use stride 1 for the last conv4 layer resnet_v1_block('block3', base_depth=256, num_units=36, stride=1), resnet_v1_block('block4', base_depth=512, num_units=3, stride=1)]
调用slim的resnet_v1的接口实现这段网络resnet_v1.resnet_v1()
构建anchors
anchor 是什么?这里借用一点知乎作者马塔的回答:
anchor的本质是什么,本质是将相同尺寸的 conv5_3 层的输出,倒推得到不同尺寸的输入。接下来是anchor的窗口尺寸,详细说下这个尺寸的来源,最基本的anchor只有一个尺寸,是16*16的尺寸,然后设定了基本的面积scale是(8,16,32),用这三个scale乘以16就得到了三个面积尺寸(1282,2562,512^2),然后在每个面积尺寸下,取三种不同的长宽比例(1:1,1:2,2:1).这样一来,我们得到了一共9种面积尺寸各异的anchor。示意图如下:
不过这个示意图其实比较有误导性,首先它图中的9个框并不是在同一个中心点的,而实际上,是应该在每个特征图的每个点作为中心点生成9 个框; 其次,生成的 anchor 尺寸大小不是以特征图为基准的,甚至毫无关系,而是以 anchor ratio 和 anchor scale 得到最终的大小,并且其最大的 anchor 也基本和 resize 之后的图大小相当。
在 generate_anchors 代码文件中,可以看到如下数据,
# anchors =# -83 -39 100 56# -175 -87 192 104# -359 -183 376 200# -55 -55 72 72# -119 -119 136 136# -247 -247 264 264# -35 -79 52 96# -79 -167 96 184# -167 -343 184 360
这就是生成的最基本的9个anchor,这个anchor的坐标是xyxy类型的,它表示了图片的左上角的第1个9个anchor的坐标,后面用到的所有anchor都是用它在特征图上平移得到的(它代表的坐标是resize 后的图片坐标而不是原图)。
至于这个anchor到底是怎么用的,这个是理解整个问题的关键。
上面我们已经得到了基础网络最终的conv5_3 输出为138671024(1024是层数),在这个特征参数的基础上,通过一个3x3的滑动窗口,在这个3867的区域上进行滑动,stride=1,padding=2,这样一来,滑动得到的就是3867个3x3的窗口。
对于每个3x3的窗口,计算这个滑动窗口的中心点所对应的原始图片的中心点。然后作者假定,这个3x3窗口,是从原始图片上通过SPP池化得到的,而这个池化的区域的面积以及比例,就是一个个的anchor。换句话说,对于每个3x3窗口,作者假定它来自9种不同原始区域的池化,但是这些池化在原始图片中的中心点,都完全一样。这个中心点,就是刚才提到的,3x3窗口中心点所对应的原始图片中的中心点。如此一来,在每个窗口位置,我们都可以根据9个不同长宽比例、不同面积的anchor,逆向推导出它所对应的原始图片中的一个区域,这个区域的尺寸以及坐标,都是已知的。而这个区域,就是我们想要的 proposal。所以我们通过滑动窗口和anchor,成功得到了 3867x9 个原始图片的proposal。接下来,每个proposal我们只输出6个参数:每个 proposal 和 ground truth 进行比较得到的前景概率和背景概率(2个参数)(对应 cls_score);由于每个 proposal 和 ground truth 位置及尺寸上的差异,从 proposal 通过平移放缩得到 ground truth 需要的4个平移放缩参数(对应 bbox_pred)。
加上一点我的理解,anchor 是用来做多尺度的目标检测的,它是用来代替图像金字塔和特征金字塔的,它为什么可以达到这样的目的?可以看看它的最后一层的输出是 MN(92), 如果我们只看它在特征图 MN 个特征点的第一个点的第一个卷积核,它代表了什么含义?它相当于用这个卷积核去综合图片该点附近(33,上一步进行了33的卷积)的信息,判断有没有第一个尺寸的目标,也就是说每个卷积核都负责了一个尺寸的目标检测,那么18个卷积核,每2个负责一个任务,就达到了多尺度目标检测的目的,很巧妙的一个思路,从最终的效果来看,它实际上就是一个多尺度的目标热力图,或者用作者的话说,就相当于一个‘注意力’机制。
另外值得提出的是这里使用的是全卷积结构(33的卷积,然后接11的卷积),也就是说 M*N 也是一个二维结构,和原图的像素二维结构是对应的,那么我们就能相应的判断出该特征点对应的原图是否存在目标。 个人感觉,理解了这里的全卷积结构和 anchor 的机制,整个 faster rcnn 就明晰很多了。
最后明确的一点就是在代码中,anchor,proposal,rois ,boxes 代表的含义其实都是一样的,都是推荐的区域或者框,不过有所区别的地方在于这几个名词有一个递进的关系,最开始的是锚定的框 anchor,数量最多有约20000个(根据resize后的图片大小不同而有数量有所变化),然后是RPN网络推荐的框 proposal,数量较多,train时候有2000个,最后是实际分类时候用到的 rois 框,每张图片有256个;最后得到的结果就是 boxes。
好了, 以上就是转自知乎对于anchor的一个详细解释,我知道大家是来看代码的
入口在network.py的 _build_network函数中
# build the anchors for the image self._anchor_component() ...... ...... def _anchor_component(self): with tf.variable_scope('ANCHOR_' + self._tag) as scope: # just to get the shape right height = tf.to_int32(tf.ceil(self._im_info[0] / np.float32(self._feat_stride[0]))) width = tf.to_int32(tf.ceil(self._im_info[1] / np.float32(self._feat_stride[0]))) if cfg.USE_E2E_TF: anchors, anchor_length = generate_anchors_pre_tf( height, width, self._feat_stride, self._anchor_scales, self._anchor_ratios ) else: anchors, anchor_length = tf.py_func(generate_anchors_pre, [height, width, self._feat_stride, self._anchor_scales, self._anchor_ratios], [tf.float32, tf.int32], name="generate_anchors") anchors.set_shape([None, 4]) anchor_length.set_shape([]) self._anchors = anchors self._anchor_length = anchor_length
首先计算好偏移量
生成初始的9个anchor
lib/layer_utils/snippets.pydef generate_anchors_pre_tf(height, width, feat_stride=16, anchor_scales=(8, 16, 32), anchor_ratios=(0.5, 1, 2)): shift_x = tf.range(width) * feat_stride # width shift_y = tf.range(height) * feat_stride # height shift_x, shift_y = tf.meshgrid(shift_x, shift_y) sx = tf.reshape(shift_x, shape=(-1,)) sy = tf.reshape(shift_y, shape=(-1,)) shifts = tf.transpose(tf.stack([sx, sy, sx, sy])) K = tf.multiply(width, height) shifts = tf.transpose(tf.reshape(shifts, shape=[1, K, 4]), perm=(1, 0, 2)) anchors = generate_anchors(ratios=np.array(anchor_ratios), scales=np.array(anchor_scales)) A = anchors.shape[0] anchor_constant = tf.constant(anchors.reshape((1, A, 4)), dtype=tf.int32) length = K * A anchors_tf = tf.reshape(tf.add(anchor_constant, shifts), shape=(length, 4)) return tf.cast(anchors_tf, dtype=tf.float32), length
生成anchor的代码一目了然,这个脚本对系统环境没有要求,所以大家也可以直接运行该文件,打点断点调试,就会很清楚整个流程。
lib/layer_utils/generate_anchors.pydef generate_anchors(base_size=16, ratios=[0.5, 1, 2], scales=2 ** np.arange(3, 6)): """ Generate anchor (reference) windows by enumerating aspect ratios X scales wrt a reference (0, 0, 15, 15) window. """ base_anchor = np.array([1, 1, base_size, base_size]) - 1 ratio_anchors = _ratio_enum(base_anchor, ratios) anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales) for i in range(ratio_anchors.shape[0])]) return anchorsdef _whctrs(anchor): """ Return width, height, x center, and y center for an anchor (window). """ w = anchor[2] - anchor[0] + 1 h = anchor[3] - anchor[1] + 1 x_ctr = anchor[0] + 0.5 * (w - 1) y_ctr = anchor[1] + 0.5 * (h - 1) return w, h, x_ctr, y_ctrdef _mkanchors(ws, hs, x_ctr, y_ctr): """ Given a vector of widths (ws) and heights (hs) around a center (x_ctr, y_ctr), output a set of anchors (windows). """ ws = ws[:, np.newaxis] hs = hs[:, np.newaxis] anchors = np.hstack((x_ctr - 0.5 * (ws - 1), y_ctr - 0.5 * (hs - 1), x_ctr + 0.5 * (ws - 1), y_ctr + 0.5 * (hs - 1))) return anchorsdef _ratio_enum(anchor, ratios): """ Enumerate a set of anchors for each aspect ratio wrt an anchor. """ w, h, x_ctr, y_ctr = _whctrs(anchor) size = w * h size_ratios = size / ratios ws = np.round(np.sqrt(size_ratios)) hs = np.round(ws * ratios) anchors = _mkanchors(ws, hs, x_ctr, y_ctr) return anchorsdef _scale_enum(anchor, scales): """ Enumerate a set of anchors for each scale wrt an anchor. """ w, h, x_ctr, y_ctr = _whctrs(anchor) ws = w * scales hs = h * scales anchors = _mkanchors(ws, hs, x_ctr, y_ctr) return anchors
构建RPN
RPN层利用anchors在图片上的滑动,与256(512)个3*3的滑窗做卷积,生成全连接层。其主要作用有二:
预测proposal的中心锚点对应的坐标x,y以及宽高w,h
判断proposal区域是前景还是背景
代码入口
# region proposal network rois = self._region_proposal(net_conv, is_training, initializer)
lib/nets/network.py def _region_proposal(self, net_conv, is_training, initializer): rpn = slim.conv2d(net_conv, cfg.RPN_CHANNELS, [3, 3], trainable=is_training, weights_initializer=initializer, scope="rpn_conv/3x3") self._act_summaries.append(rpn) rpn_cls_score = slim.conv2d(rpn, self._num_anchors * 2, [1, 1], trainable=is_training, weights_initializer=initializer, padding='VALID', activation_fn=None, scope='rpn_cls_score') # change it so that the score has 2 as its channel size rpn_cls_score_reshape = self._reshape_layer(rpn_cls_score, 2, 'rpn_cls_score_reshape') rpn_cls_prob_reshape = self._softmax_layer(rpn_cls_score_reshape, "rpn_cls_prob_reshape") rpn_cls_pred = tf.argmax(tf.reshape(rpn_cls_score_reshape, [-1, 2]), axis=1, name="rpn_cls_pred") rpn_cls_prob = self._reshape_layer(rpn_cls_prob_reshape, self._num_anchors * 2, "rpn_cls_prob") rpn_bbox_pred = slim.conv2d(rpn, self._num_anchors * 4, [1, 1], trainable=is_training, weights_initializer=initializer, padding='VALID', activation_fn=None, scope='rpn_bbox_pred') if is_training: rois, roi_scores = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois") rpn_labels = self._anchor_target_layer(rpn_cls_score, "anchor") # Try to have a deterministic order for the computing graph, for reproducibility with tf.control_dependencies([rpn_labels]): rois, _ = self._proposal_target_layer(rois, roi_scores, "rpn_rois") else: if cfg.TEST.MODE == 'nms': rois, _ = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois") elif cfg.TEST.MODE == 'top': rois, _ = self._proposal_top_layer(rpn_cls_prob, rpn_bbox_pred, "rois") else: raise NotImplementedError self._predictions["rpn_cls_score"] = rpn_cls_score self._predictions["rpn_cls_score_reshape"] = rpn_cls_score_reshape self._predictions["rpn_cls_prob"] = rpn_cls_prob self._predictions["rpn_cls_pred"] = rpn_cls_pred self._predictions["rpn_bbox_pred"] = rpn_bbox_pred self._predictions["rois"] = rois return rois
_region_proposal这段代码逻辑看细节的话会有点绕,所以先回顾一下整个faster rcnn的流程图
_region_proposal 函数分别做了以下几件事:
将特征图[60 * 40 * 256] (取决于原始图片的像素还有缩放scale还有选择的特征提取网络) 与 256个 3*3 的flters卷积(近一步提取特征)得到[60 * 40 * 9]的图
与18个11的filters做卷积,也就是92,对应着每个像素9个anchor,乘以2表示每个anchor对应2个scores,分别表示前景或者背景。通过reshape -> softmax -> reshape 获取了目标是否是物体的预测以及得分。输出的参数有两个rpn_cls_pred(预测结果),rpn_cls_prob(前景和背景的概率)。需要监督的信息是Y=0,1,表示这个区域是否是ground truth。
与36个1*1的filters做卷积,也就是 9 * 4, 得到Anchor Box的坐标信息,其实是偏移量
ground truth:标定的框也对应一个中心点位置坐标x,y和宽高w,h
anchor box: 中心点位置坐标x_a,y_a和宽高w_a,h_a
所以,偏移量:
x=(x-x_a)/w_a y=(y-y_a)/h_a
w=log(w/w_a) h=log(h/h_a)
通过ground truth box与预测的anchor box之间的差异来进行学习,从而是RPN网络中的权重能够学习到预测box的能力
rois, roi_scores = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois"), 这一步获取2000个proposals
rpn_labels = self._anchor_target_layer(rpn_cls_score, "anchor"),计算每个anchor的label值
rois, _ = self._proposal_target_layer(rois, roi_scores, "rpn_rois"),选取__C.TRAIN.BATCH_SIZE个正样本和负样本作为训练的一个mini batch。这一步的作用是给PRN提供的proposals分配标签,计算proposals和ground truth boxes的偏移量,用于网络最后一层(bbox_pred)回归参数的学习。
接下来详细为大家介绍一下步骤4,5,6
4、rois, roi_scores = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
lib/nets/network.py def _proposal_layer(self, rpn_cls_prob, rpn_bbox_pred, name): with tf.variable_scope(name) as scope: if cfg.USE_E2E_TF: rois, rpn_scores = proposal_layer_tf( rpn_cls_prob, rpn_bbox_pred, self._im_info, self._mode, self._feat_stride, self._anchors, self._num_anchors ) else: rois, rpn_scores = tf.py_func(proposal_layer, [rpn_cls_prob, rpn_bbox_pred, self._im_info, self._mode, self._feat_stride, self._anchors, self._num_anchors], [tf.float32, tf.float32], name="proposal") rois.set_shape([None, 5]) rpn_scores.set_shape([None, 1]) return rois, rpn_scores
proposal_layer_tf 的流程如下
从config文件读取配置 post_nms_topN(执行NMS算法后proposal的数量), nms_thresh(NMS 阈值)
原始anchor给出的proposal通过学习参数rpn_bbox_pred,转换为与ground truth接近的边框,裁剪掉超出图片的部分
执行NMS算法,获取最终的proposals
用下图一个案例来对NMS算法进行简单介绍
如上图所示,一共有6个识别为人的框,每一个框有一个置信率。
现在需要消除多余的:
按置信率排序: 0.95, 0.9, 0.9, 0.8, 0.7, 0.7
取最大0.95的框为一个物体框
剩余5个框中,去掉与0.95框重叠率IoU大于0.6(可以另行设置),则保留0.9, 0.8, 0.7三个框
重复上面的步骤,直到没有框了,0.9为一个框
选出来的为: 0.95, 0.9
lib/layer_utils/proposal_layer.pydef proposal_layer_tf(rpn_cls_prob, rpn_bbox_pred, im_info, cfg_key, _feat_stride, anchors, num_anchors): if type(cfg_key) == bytes: cfg_key = cfg_key.decode('utf-8') pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N nms_thresh = cfg[cfg_key].RPN_NMS_THRESH # Get the scores and bounding boxes scores = rpn_cls_prob[:, :, :, num_anchors:] scores = tf.reshape(scores, shape=(-1,)) rpn_bbox_pred = tf.reshape(rpn_bbox_pred, shape=(-1, 4)) proposals = bbox_transform_inv_tf(anchors, rpn_bbox_pred) proposals = clip_boxes_tf(proposals, im_info[:2]) # Non-maximal suppression indices = tf.image.non_max_suppression(proposals, scores, max_output_size=post_nms_topN, iou_threshold=nms_thresh) boxes = tf.gather(proposals, indices) boxes = tf.to_float(boxes) scores = tf.gather(scores, indices) scores = tf.reshape(scores, shape=(-1, 1)) # Only support single image as input batch_inds = tf.zeros((tf.shape(indices)[0], 1), dtype=tf.float32) blob = tf.concat([batch_inds, boxes], 1) return blob, scores
bbox_transform_inv_tf: 每个anchor的边框学习之前得到的偏移量(这里的偏移量就是需要学习的rpn_bbox_pred)做位移和缩放,获取最终的预测边框。也就是将原始proposal A, 通过学习rpn_bbox_pred中的参数,得到一个与ground truth G 相近的预测边框 G'。
clip_boxes_tf: 剪裁掉超出原始图片边框的部分。
lib/model/bbox_transform.pydef bbox_transform_inv_tf(boxes, deltas): boxes = tf.cast(boxes, deltas.dtype) widths = tf.subtract(boxes[:, 2], boxes[:, 0]) + 1.0 heights = tf.subtract(boxes[:, 3], boxes[:, 1]) + 1.0 ctr_x = tf.add(boxes[:, 0], widths * 0.5) ctr_y = tf.add(boxes[:, 1], heights * 0.5) dx = deltas[:, 0] dy = deltas[:, 1] dw = deltas[:, 2] dh = deltas[:, 3] pred_ctr_x = tf.add(tf.multiply(dx, widths), ctr_x) pred_ctr_y = tf.add(tf.multiply(dy, heights), ctr_y) pred_w = tf.multiply(tf.exp(dw), widths) pred_h = tf.multiply(tf.exp(dh), heights) pred_boxes0 = tf.subtract(pred_ctr_x, pred_w * 0.5) pred_boxes1 = tf.subtract(pred_ctr_y, pred_h * 0.5) pred_boxes2 = tf.add(pred_ctr_x, pred_w * 0.5) pred_boxes3 = tf.add(pred_ctr_y, pred_h * 0.5) return tf.stack([pred_boxes0, pred_boxes1, pred_boxes2, pred_boxes3], axis=1)def clip_boxes_tf(boxes, im_info): b0 = tf.maximum(tf.minimum(boxes[:, 0], im_info[1] - 1), 0) b1 = tf.maximum(tf.minimum(boxes[:, 1], im_info[0] - 1), 0) b2 = tf.maximum(tf.minimum(boxes[:, 2], im_info[1] - 1), 0) b3 = tf.maximum(tf.minimum(boxes[:, 3], im_info[0] - 1), 0) return tf.stack([b0, b1, b2, b3], axis=1)
5、rpn_labels = self._anchor_target_layer(rpn_cls_score, "anchor")
这一步要给 self._anchors 中的所有 anchor 赋以一个 label 值:
1 : 正样本
0 : 负样本
-1 : 非样本, 不用于训练
同时初始化一些参数用于后面计算损失函数(关于损失函数, 大家可以去看https://blog.csdn.net/wfei101/article/details/79809332,我觉得讲的比论文清楚的多):rpn_bbox_targets: PRN网络边框回归的ground truth
rpn_bbox_inside_weights: label为1的行,也就是目标区域为前景的行,参数为[1.0 1.0 1.0 1.0],其余为0
rpn_bbox_outside_weights: label为0或者1的行,参数为[1.0 1.0 1.0 1.0] / len(fg+bg)
lib/nets/network.py def _anchor_target_layer(self, rpn_cls_score, name): with tf.variable_scope(name) as scope: rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = tf.py_func( anchor_target_layer, [rpn_cls_score, self._gt_boxes, self._im_info, self._feat_stride, self._anchors, self._num_anchors], [tf.float32, tf.float32, tf.float32, tf.float32], name="anchor_target") rpn_labels.set_shape([1, 1, None, None]) rpn_bbox_targets.set_shape([1, None, None, self._num_anchors * 4]) rpn_bbox_inside_weights.set_shape([1, None, None, self._num_anchors * 4]) rpn_bbox_outside_weights.set_shape([1, None, None, self._num_anchors * 4]) rpn_labels = tf.to_int32(rpn_labels, name="to_int32") self._anchor_targets['rpn_labels'] = rpn_labels self._anchor_targets['rpn_bbox_targets'] = rpn_bbox_targets self._anchor_targets['rpn_bbox_inside_weights'] = rpn_bbox_inside_weights self._anchor_targets['rpn_bbox_outside_weights'] = rpn_bbox_outside_weights self._score_summaries.update(self._anchor_targets) return rpn_labels
lib/layer_utils/anchor_target_layer.pydef anchor_target_layer(rpn_cls_score, gt_boxes, im_info, _feat_stride, all_anchors, num_anchors): """Same as the anchor target layer in original Fast/er RCNN """ A = num_anchors total_anchors = all_anchors.shape[0] K = total_anchors / num_anchors # allow boxes to sit over the edge by a small amount _allowed_border = 0 # map of shape (..., H, W) height, width = rpn_cls_score.shape[1:3] # only keep anchors inside the image inds_inside = np.where( (all_anchors[:, 0] >= -_allowed_border) & (all_anchors[:, 1] >= -_allowed_border) & (all_anchors[:, 2] < im_info[1] + _allowed_border) & # width (all_anchors[:, 3] < im_info[0] + _allowed_border) # height )[0] # keep only inside anchors anchors = all_anchors[inds_inside, :] # label: 1 is positive, 0 is negative, -1 is dont care labels = np.empty((len(inds_inside),), dtype=np.float32) labels.fill(-1) # overlaps between the anchors and the gt boxes # overlaps (ex, gt) overlaps = bbox_overlaps( np.ascontiguousarray(anchors, dtype=np.float), np.ascontiguousarray(gt_boxes, dtype=np.float)) argmax_overlaps = overlaps.argmax(axis=1) max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps] gt_argmax_overlaps = overlaps.argmax(axis=0) gt_max_overlaps = overlaps[gt_argmax_overlaps, np.arange(overlaps.shape[1])] gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0] if not cfg.TRAIN.RPN_CLOBBER_POSITIVES: # assign bg labels first so that positive labels can clobber them # first set the negatives labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 # fg label: for each gt, anchor with highest overlap labels[gt_argmax_overlaps] = 1 # fg label: above threshold IOU labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1 if cfg.TRAIN.RPN_CLOBBER_POSITIVES: # assign bg labels last so that negative labels can clobber positives labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 # subsample positive labels if we have too many num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE) fg_inds = np.where(labels == 1)[0] if len(fg_inds) > num_fg: disable_inds = npr.choice( fg_inds, size=(len(fg_inds) - num_fg), replace=False) labels[disable_inds] = -1 # subsample negative labels if we have too many num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1) bg_inds = np.where(labels == 0)[0] if len(bg_inds) > num_bg: disable_inds = npr.choice( bg_inds, size=(len(bg_inds) - num_bg), replace=False) labels[disable_inds] = -1 bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32) bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :]) bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) # only the positive ones have regression targets bbox_inside_weights[labels == 1, :] = np.array(cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS) bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) if cfg.TRAIN.RPN_POSITIVE_WEIGHT < 0: # uniform weighting of examples (given non-uniform sampling) num_examples = np.sum(labels >= 0) positive_weights = np.ones((1, 4)) * 1.0 / num_examples negative_weights = np.ones((1, 4)) * 1.0 / num_examples else: assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT > 0) & (cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1)) positive_weights = (cfg.TRAIN.RPN_POSITIVE_WEIGHT / np.sum(labels == 1)) negative_weights = ((1.0 - cfg.TRAIN.RPN_POSITIVE_WEIGHT) / np.sum(labels == 0)) bbox_outside_weights[labels == 1, :] = positive_weights bbox_outside_weights[labels == 0, :] = negative_weights # map up to original set of anchors labels = _unmap(labels, total_anchors, inds_inside, fill=-1) bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0) bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0) bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0) # labels labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2) labels = labels.reshape((1, 1, A * height, width)) rpn_labels = labels # bbox_targets bbox_targets = bbox_targets \ .reshape((1, height, width, A * 4)) rpn_bbox_targets = bbox_targets # bbox_inside_weights bbox_inside_weights = bbox_inside_weights \ .reshape((1, height, width, A * 4)) rpn_bbox_inside_weights = bbox_inside_weights # bbox_outside_weights bbox_outside_weights = bbox_outside_weights \ .reshape((1, height, width, A * 4)) rpn_bbox_outside_weights = bbox_outside_weights return rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weightsdef _compute_targets(ex_rois, gt_rois): """Compute bounding-box regression targets for an image.""" assert ex_rois.shape[0] == gt_rois.shape[0] assert ex_rois.shape[1] == 4 assert gt_rois.shape[1] == 5 return bbox_transform(ex_rois, gt_rois[:, :4]).astype(np.float32, copy=False)
lib/model/bbox_transform.pydef bbox_transform(ex_rois, gt_rois): ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0 ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0 ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0 gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0 gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights targets_dw = np.log(gt_widths / ex_widths) targets_dh = np.log(gt_heights / ex_heights) targets = np.vstack( (targets_dx, targets_dy, targets_dw, targets_dh)).transpose() return targets
6、rois, _ = self._proposal_target_layer(rois, roi_scores, "rpn_rois")
从之前RPN给出的2000个proposal中选出__C.TRAIN.BATCH_SIZE(128, 其中25%是前景, 75%是背景)作为训练的一批。
lib/nets/network.py def _proposal_target_layer(self, rois, roi_scores, name): with tf.variable_scope(name) as scope: rois, roi_scores, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights = tf.py_func( proposal_target_layer, [rois, roi_scores, self._gt_boxes, self._num_classes], [tf.float32, tf.float32, tf.float32, tf.float32, tf.float32, tf.float32], name="proposal_target") rois.set_shape([cfg.TRAIN.BATCH_SIZE, 5]) roi_scores.set_shape([cfg.TRAIN.BATCH_SIZE]) labels.set_shape([cfg.TRAIN.BATCH_SIZE, 1]) bbox_targets.set_shape([cfg.TRAIN.BATCH_SIZE, self._num_classes * 4]) bbox_inside_weights.set_shape([cfg.TRAIN.BATCH_SIZE, self._num_classes * 4]) bbox_outside_weights.set_shape([cfg.TRAIN.BATCH_SIZE, self._num_classes * 4]) self._proposal_targets['rois'] = rois self._proposal_targets['labels'] = tf.to_int32(labels, name="to_int32") self._proposal_targets['bbox_targets'] = bbox_targets self._proposal_targets['bbox_inside_weights'] = bbox_inside_weights self._proposal_targets['bbox_outside_weights'] = bbox_outside_weights self._score_summaries.update(self._proposal_targets) return rois, roi_scores
lib/layer_utils/proposal_target_layer.pydef proposal_target_layer(rpn_rois, rpn_scores, gt_boxes, _num_classes): """ Assign object detection proposals to ground-truth targets. Produces proposal classification labels and bounding-box regression targets. """ # Proposal ROIs (0, x1, y1, x2, y2) coming from RPN # (i.e., rpn.proposal_layer.ProposalLayer), or any other source all_rois = rpn_rois all_scores = rpn_scores # Include ground-truth boxes in the set of candidate rois if cfg.TRAIN.USE_GT: zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype) all_rois = np.vstack( (all_rois, np.hstack((zeros, gt_boxes[:, :-1]))) ) # not sure if it a wise appending, but anyway i am not using it all_scores = np.vstack((all_scores, zeros)) num_images = 1 rois_per_image = cfg.TRAIN.BATCH_SIZE / num_images fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image) # Sample rois with classification labels and bounding box regression # targets labels, rois, roi_scores, bbox_targets, bbox_inside_weights = _sample_rois( all_rois, all_scores, gt_boxes, fg_rois_per_image, rois_per_image, _num_classes) rois = rois.reshape(-1, 5) roi_scores = roi_scores.reshape(-1) labels = labels.reshape(-1, 1) bbox_targets = bbox_targets.reshape(-1, _num_classes * 4) bbox_inside_weights = bbox_inside_weights.reshape(-1, _num_classes * 4) bbox_outside_weights = np.array(bbox_inside_weights > 0).astype(np.float32) return rois, roi_scores, labels, bbox_targets, bbox_inside_weights, bbox_outside_weightsdef _get_bbox_regression_labels(bbox_target_data, num_classes): """Bounding-box regression targets (bbox_target_data) are stored in a compact form N x (class, tx, ty, tw, th) This function expands those targets into the 4-of-4*K representation used by the network (i.e. only one class has non-zero targets). Returns: bbox_target (ndarray): N x 4K blob of regression targets bbox_inside_weights (ndarray): N x 4K blob of loss weights """ clss = bbox_target_data[:, 0] bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32) bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32) inds = np.where(clss > 0)[0] for ind in inds: cls = clss[ind] start = int(4 * cls) end = start + 4 bbox_targets[ind, start:end] = bbox_target_data[ind, 1:] bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS return bbox_targets, bbox_inside_weightsdef _compute_targets(ex_rois, gt_rois, labels): """Compute bounding-box regression targets for an image.""" assert ex_rois.shape[0] == gt_rois.shape[0] assert ex_rois.shape[1] == 4 assert gt_rois.shape[1] == 4 targets = bbox_transform(ex_rois, gt_rois) if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: # Optionally normalize targets by a precomputed mean and stdev targets = ((targets - np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS)) / np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS)) return np.hstack( (labels[:, np.newaxis], targets)).astype(np.float32, copy=False)def _sample_rois(all_rois, all_scores, gt_boxes, fg_rois_per_image, rois_per_image, num_classes): """Generate a random sample of RoIs comprising foreground and background examples. """ # overlaps: (rois x gt_boxes) overlaps = bbox_overlaps( np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float), np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float)) gt_assignment = overlaps.argmax(axis=1) max_overlaps = overlaps.max(axis=1) labels = gt_boxes[gt_assignment, 4] # Select foreground RoIs as those with >= FG_THRESH overlap fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0] # Guard against the case when an image has fewer than fg_rois_per_image # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) & (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0] # Small modification to the original version where we ensure a fixed number of regions are sampled if fg_inds.size > 0 and bg_inds.size > 0: fg_rois_per_image = min(fg_rois_per_image, fg_inds.size) fg_inds = npr.choice(fg_inds, size=int(fg_rois_per_image), replace=False) bg_rois_per_image = rois_per_image - fg_rois_per_image to_replace = bg_inds.size < bg_rois_per_image bg_inds = npr.choice(bg_inds, size=int(bg_rois_per_image), replace=to_replace) elif fg_inds.size > 0: to_replace = fg_inds.size < rois_per_image fg_inds = npr.choice(fg_inds, size=int(rois_per_image), replace=to_replace) fg_rois_per_image = rois_per_image elif bg_inds.size > 0: to_replace = bg_inds.size < rois_per_image bg_inds = npr.choice(bg_inds, size=int(rois_per_image), replace=to_replace) fg_rois_per_image = 0 else: import pdb pdb.set_trace() # The indices that we're selecting (both fg and bg) keep_inds = np.append(fg_inds, bg_inds) # Select sampled values from various arrays: labels = labels[keep_inds] # Clamp labels for the background RoIs to 0 labels[int(fg_rois_per_image):] = 0 rois = all_rois[keep_inds] roi_scores = all_scores[keep_inds] bbox_target_data = _compute_targets( rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels) bbox_targets, bbox_inside_weights = \ _get_bbox_regression_labels(bbox_target_data, num_classes) return labels, rois, roi_scores, bbox_targets, bbox_inside_weights
ROI层
这一层的输入是原始图片经过特征网络处理过的feature map和rois。
ROI主要做了3件事:
因为rois是在原始图片下的坐标,我们第一步需要将rois的坐标映射到feature map上
将映射后的坐标分为大小相等sections
对每个section做max pooling
代码还是很清晰的
lib/nets/network.py def _crop_pool_layer(self, bottom, rois, name): with tf.variable_scope(name) as scope: batch_ids = tf.squeeze(tf.slice(rois, [0, 0], [-1, 1], name="batch_id"), [1]) # Get the normalized coordinates of bounding boxes bottom_shape = tf.shape(bottom) height = (tf.to_float(bottom_shape[1]) - 1.) * np.float32(self._feat_stride[0]) width = (tf.to_float(bottom_shape[2]) - 1.) * np.float32(self._feat_stride[0]) x1 = tf.slice(rois, [0, 1], [-1, 1], name="x1") / width y1 = tf.slice(rois, [0, 2], [-1, 1], name="y1") / height x2 = tf.slice(rois, [0, 3], [-1, 1], name="x2") / width y2 = tf.slice(rois, [0, 4], [-1, 1], name="y2") / height # Won't be back-propagated to rois anyway, but to save time bboxes = tf.stop_gradient(tf.concat([y1, x1, y2, x2], axis=1)) pre_pool_size = cfg.POOLING_SIZE * 2 crops = tf.image.crop_and_resize(bottom, bboxes, tf.to_int32(batch_ids), [pre_pool_size, pre_pool_size], name="crops") return slim.max_pool2d(crops, [2, 2], padding='SAME')
bboxes = tf.stop_gradient(tf.concat([y1, x1, y2, x2], axis=1)) 这行代码主要是为了节约计算时间,因为反向传播到这一步就停止了。
crops = tf.image.crop_and_resize(bottom, bboxes, tf.to_int32(batch_ids), [pre_pool_size, pre_pool_size], name="crops") 这行代码就是将feature map分为14 * 14的section,主要是为了适配不同size图片的feature map。
return slim.max_pool2d(crops, [2, 2], padding='SAME') 这行代码给出了返回结果,实现了filter为2 * 2的max pooling, 所以输出的结果是一个7 * 7的图。
构建全连接层
fc7 = self._head_to_tail(pool5, is_training)
下面代码的实现不用解释了吧,就是用resnet_v1.resnet_v1实现了一个FC
lib/nets/resnet_v1.py def _head_to_tail(self, pool5, is_training, reuse=None): with slim.arg_scope(resnet_arg_scope(is_training=is_training)): fc7, _ = resnet_v1.resnet_v1(pool5, self._blocks[-1:], global_pool=False, include_root_block=False, reuse=reuse, scope=self._scope) # average pooling done by reduce_mean fc7 = tf.reduce_mean(fc7, axis=[1, 2]) return fc7
区域分类和目标边框的回归
cls_prob, bbox_pred = self._region_classification(fc7, is_training,
initializer, initializer_bbox)
这段代码就是整个网络的额输出部分。
lib/nets/network.py def _region_classification(self, fc7, is_training, initializer, initializer_bbox): cls_score = slim.fully_connected(fc7, self._num_classes, weights_initializer=initializer, trainable=is_training, activation_fn=None, scope='cls_score') cls_prob = self._softmax_layer(cls_score, "cls_prob") cls_pred = tf.argmax(cls_score, axis=1, name="cls_pred") bbox_pred = slim.fully_connected(fc7, self._num_classes * 4, weights_initializer=initializer_bbox, trainable=is_training, activation_fn=None, scope='bbox_pred') self._predictions["cls_score"] = cls_score self._predictions["cls_pred"] = cls_pred self._predictions["cls_prob"] = cls_prob self._predictions["bbox_pred"] = bbox_pred return cls_prob, bbox_pred
到此整个faster rcnn的训练部分已经结束,testing部分的代码容易一些,跟训练也差不多,本文就不赘述了。
作者:PlutusTech
链接:https://www.jianshu.com/p/6797b29c91bf