generate_proposals.py
# Copyright (c) 2017-present, Facebook, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
#
# Based on:
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# --------------------------------------------------------
import numpy as np

from detectron.core.config import cfg
import detectron.utils.boxes as box_utils


class GenerateProposalsOp(object):
    """Output object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").

    See the comment in utils/boxes:bbox_transform_inv for details about the
    optional `reg_weights` parameter.
    """

    def __init__(
        self, anchors, spatial_scale, train, reg_weights=(1.0, 1.0, 1.0, 1.0)
    ):
        self._anchors = anchors
        self._num_anchors = self._anchors.shape[0]
        self._feat_stride = 1. / spatial_scale
        self._train = train
        self._reg_weights = reg_weights
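
    # Illustrative, assumed values (not from the original file): for a typical
    # stride-16 RPN, spatial_scale would be 1. / 16 (so _feat_stride works out
    # to 16.) and anchors an (A, 4) array of (x1, y1, x2, y2) box templates.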

    def forward(self, inputs, outputs):
        """See modeling.detector.GenerateProposals for inputs/outputs
        documentation.
        """
        # 1. for each location i in a (H, W) grid:
        #      generate A anchor boxes centered on cell i
        #      apply predicted bbox deltas to each of the A anchors at cell i
        # 2. clip predicted boxes to image
        # 3. remove predicted boxes with either height or width < threshold
        # 4. sort all (proposal, score) pairs by score from highest to lowest
        # 5. take the top pre_nms_topN proposals before NMS
        # 6. apply NMS with a loose threshold (0.7) to the remaining proposals
        # 7. take after_nms_topN proposals after NMS
        # 8. return the top proposals
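        # (Anchor enumeration for step 1 happens below in this method; steps
        # 2-8 run once per image inside proposals_for_one_image().)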

        # predicted probability of fg object for each RPN anchor
        scores = inputs[0].data
        # predicted anchor transformations
        bbox_deltas = inputs[1].data
        # input image (height, width, scale), in which scale is the scale
        # factor applied to the original dataset image to get the network
        # input image
        im_info = inputs[2].data
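        # (Assumed, illustrative values: im_info might be [800., 1067., 1.6]
        # for a 500x667 source image whose shorter side was resized to 800.)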

        # 1. Generate proposals from bbox deltas and shifted anchors
        height, width = scores.shape[-2:]
        # Enumerate all shifted positions on the (H, W) grid
        shift_x = np.arange(0, width) * self._feat_stride
        shift_y = np.arange(0, height) * self._feat_stride
        shift_x, shift_y = np.meshgrid(shift_x, shift_y, copy=False)
        # Convert to (K, 4), K=H*W, where the columns are (dx, dy, dx, dy)
        # shift pointing to each grid location
        shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                            shift_x.ravel(), shift_y.ravel())).transpose()
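        # e.g. with a stride of 16 and a 2x2 grid, shifts is
        #   [[ 0,  0,  0,  0],
        #    [16,  0, 16,  0],
        #    [ 0, 16,  0, 16],
        #    [16, 16, 16, 16]]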

        # Broadcast anchors over shifts to enumerate all anchors at all
        # positions in the (H, W) grid:
        # - add A anchors of shape (1, A, 4) to
        # - K shifts of shape (K, 1, 4) to get
        # - all shifted anchors of shape (K, A, 4)
        # - reshape to (K*A, 4) shifted anchors
        num_images = inputs[0].shape[0]
        A = self._num_anchors
        K = shifts.shape[0]
        all_anchors = self._anchors[np.newaxis, :, :] + shifts[:, np.newaxis, :]
        all_anchors = all_anchors.reshape((K * A, 4))
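        # (For illustration, assuming a 50x38 feature map with A = 15
        # templates: K = 1900 cells and all_anchors has shape (28500, 4).)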

        rois = np.empty((0, 5), dtype=np.float32)
        roi_probs = np.empty((0, 1), dtype=np.float32)
        for im_i in range(num_images):
            im_i_boxes, im_i_probs = self.proposals_for_one_image(
                im_info[im_i, :], all_anchors, bbox_deltas[im_i, :, :, :],
                scores[im_i, :, :, :]
            )
            batch_inds = im_i * np.ones(
                (im_i_boxes.shape[0], 1), dtype=np.float32
            )
            im_i_rois = np.hstack((batch_inds, im_i_boxes))
            rois = np.append(rois, im_i_rois, axis=0)
            roi_probs = np.append(roi_probs, im_i_probs, axis=0)

        outputs[0].reshape(rois.shape)
        outputs[0].data[...] = rois
        if len(outputs) > 1:
            outputs[1].reshape(roi_probs.shape)
            outputs[1].data[...] = roi_probs
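
    # Each row of the rois output above has the form
    # (batch_idx, x1, y1, x2, y2); the optional second output carries the
    # matching objectness probabilities.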

    def proposals_for_one_image(
        self, im_info, all_anchors, bbox_deltas, scores
    ):
        # Get mode-dependent configuration
        cfg_key = 'TRAIN' if self._train else 'TEST'
        pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
        post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
        nms_thresh = cfg[cfg_key].RPN_NMS_THRESH
        min_size = cfg[cfg_key].RPN_MIN_SIZE

        # Transpose and reshape predicted bbox transformations to get them
        # into the same order as the anchors:
        # - bbox deltas will be (4 * A, H, W) format from conv output
        # - transpose to (H, W, 4 * A)
        # - reshape to (H * W * A, 4) where rows are ordered by (H, W, A)
        #   in slowest to fastest order to match the enumerated anchors
        bbox_deltas = bbox_deltas.transpose((1, 2, 0)).reshape((-1, 4))

        # Same story for the scores:
        # - scores are (A, H, W) format from conv output
        # - transpose to (H, W, A)
        # - reshape to (H * W * A, 1) where rows are ordered by (H, W, A)
        #   to match the order of anchors and bbox_deltas
        scores = scores.transpose((1, 2, 0)).reshape((-1, 1))
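        # (Continuing the illustrative shapes above: a (60, 50, 38) delta blob
        # and a (15, 50, 38) score blob become (28500, 4) and (28500, 1)
        # arrays in matching anchor order.)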

        # 4. sort all (proposal, score) pairs by score from highest to lowest
        # 5. take top pre_nms_topN (e.g. 6000)
        if pre_nms_topN <= 0 or pre_nms_topN >= len(scores):
            order = np.argsort(-scores.squeeze())
        else:
            # Avoid sorting possibly large arrays; first partition to get the
            # top K unsorted, then sort just those (~20x faster for 200k
            # scores)
            inds = np.argpartition(
                -scores.squeeze(), pre_nms_topN
            )[:pre_nms_topN]
            order = np.argsort(-scores[inds].squeeze())
            order = inds[order]
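        # (np.argpartition(-s, k)[:k] yields the indices of the k largest
        # scores in arbitrary order; the follow-up argsort then only has to
        # order those k entries rather than the full array.)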
        bbox_deltas = bbox_deltas[order, :]
        all_anchors = all_anchors[order, :]
        scores = scores[order]

        # Transform anchors into proposals via bbox transformations
        proposals = box_utils.bbox_transform(
            all_anchors, bbox_deltas, self._reg_weights
        )

        # 2. clip proposals to image (may result in proposals with zero area
        # that will be removed in the next step)
        proposals = box_utils.clip_tiled_boxes(proposals, im_info[:2])

        # 3. remove predicted boxes with either height or width < min_size
        keep = _filter_boxes(proposals, min_size, im_info)
        proposals = proposals[keep, :]
        scores = scores[keep]

        # 6. apply loose nms (e.g. threshold = 0.7)
        # 7. take after_nms_topN (e.g. 300)
        # 8. return the top proposals (-> RoIs top)
        if nms_thresh > 0:
            keep = box_utils.nms(np.hstack((proposals, scores)), nms_thresh)
            if post_nms_topN > 0:
                keep = keep[:post_nms_topN]
            proposals = proposals[keep, :]
            scores = scores[keep]
        return proposals, scores
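
    # proposals_for_one_image returns an (N, 4) array of (x1, y1, x2, y2)
    # boxes and an (N, 1) array of their objectness scores, ordered from
    # highest to lowest score.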


def _filter_boxes(boxes, min_size, im_info):
    """Only keep boxes with both sides >= min_size and center within the image.
    """
    # Compute the width and height of the proposal boxes as measured in the
    # original image coordinate system (this is required to avoid "Negative
    # Areas Found" assertions in other parts of the code that measure box
    # areas)
    im_scale = im_info[2]
    ws_orig_scale = (boxes[:, 2] - boxes[:, 0]) / im_scale + 1
    hs_orig_scale = (boxes[:, 3] - boxes[:, 1]) / im_scale + 1
    # To avoid numerical issues we require the min_size to be at least 1 pixel
    # in the original image
    min_size = np.maximum(min_size, 1)
    # Proposal center is computed relative to the scaled input image
    ws = boxes[:, 2] - boxes[:, 0] + 1
    hs = boxes[:, 3] - boxes[:, 1] + 1
    x_ctr = boxes[:, 0] + ws / 2.
    y_ctr = boxes[:, 1] + hs / 2.
    keep = np.where(
        (ws_orig_scale >= min_size)
        & (hs_orig_scale >= min_size)
        & (x_ctr < im_info[1])
        & (y_ctr < im_info[0])
    )[0]
    return keep
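

if __name__ == '__main__':
    # Minimal usage sketch for _filter_boxes with made-up values: a 200x300
    # network input produced by scaling the original image by 2.0, and a
    # min_size of 4 pixels. All numbers here are assumptions for illustration.
    demo_im_info = np.array([200., 300., 2.0], dtype=np.float32)
    demo_boxes = np.array(
        [[10., 10., 50., 60.],       # kept: large enough, center in image
         [10., 10., 14., 12.],       # dropped: under min_size at orig. scale
         [280., 190., 400., 260.]],  # dropped: center lies outside the image
        dtype=np.float32
    )
    print(_filter_boxes(demo_boxes, 4, demo_im_info))  # -> [0]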