首页 > 美文鉴赏

目标检测算法中ROI提取方法比较+源码分析

更新时间:2023-05-12 02:19:59 阅读：评论：0

目标检测算法中ROI提取方法比较+源码分析

本文主要介绍ROI提取结构在目标检测框架中的作用，并结合源码，理解它的实现方式。包含的算法有：ROI-pooling，ROI-align，Deformable-psroi-pooling。

目前，主流的目标检测算法大致分为2种：one-stage和two-stage方法。

one-stage：典型代表为SSD，相当于two-stage中的rpn结构，先通过基本的特征提取网络如resnet或vggnet得到特征图，再通过5层的卷积得到目标位置和类别。这种方法计算速度快，但是精度较two-stage方法差一些。

two-stage：典型代表为Faster-rcnn。其结构分为RPN(Region Proposal Network)和RCNN(Region-based Convolutional Neural Network)两个部分。RPN的特征通过ROI-Pooling层传递到RCNN中。

本文介绍的方法，仅出现在two-stage的方法中。顾名思义，该层的作用就是根据RPN提取的位置，从特征图中截取对应的特征，用于进一步的分类和定位。

ROI-Pooling

Roi-pooling是Faster-rcnn原版使用的特征提取方式。这里用动图来说明roi-pooling的过程。

从上图看出，ROI-Pooling层的输入有两个：RPN层得到的位置和特征提取网络得到的特征。参数有pooling结果的宽高。

好了，下面结合图片来理解ROI-pooling代码。由于是cuda代码，所以如果你看过其他支持自定义operation的框架（如caffe，mxnet等）的roipooling实现方式，就会发现它们是完全一致的。

代码的注释中，增加了上图实例中各个变量的实际值，以便于读者理解。

/*
 * ROI max-pooling forward kernel.
 *
 * Each value of `index` produced by CUDA_KERNEL_LOOP handles one output
 * element, identified by (n, c, ph, pw) decoded from the flat index.
 *
 *   nthreads       upper bound passed to CUDA_KERNEL_LOOP
 *   bottom_data    input feature map, indexed as
 *                  ((batch * channels + c) * height + h) * width + w
 *   spatial_scale  factor applied to ROI coordinates before rounding
 *   height, width  spatial size of one feature-map channel
 *   channels       number of feature-map channels
 *   pooled_height, pooled_width
 *                  output size per ROI (2x2 in the article's example)
 *   bottom_rois    5 floats per ROI: batch index, x1, y1, x2, y2
 *   top_data       output: max value of each bin
 *   argmax_data    output (may be NULL): flat bottom_data index of the max,
 *                  or -1 when the bin is empty
 */
__global__ void ROIPoolForward(const int nthreads, const float* bottom_data,
    const float spatial_scale, const int height, const int width,
    const int channels, const int pooled_height, const int pooled_width,
    const float* bottom_rois, float* top_data, int* argmax_data)
{
    CUDA_KERNEL_LOOP(index, nthreads)
    {
        // Decode the flat output index into (n, c, ph, pw).
        int pw = index % pooled_width;
        int ph = (index / pooled_width) % pooled_height;
        int c = (index / pooled_width / pooled_height) % channels;
        int n = index / pooled_width / pooled_height / channels;

        // ROI record layout: [batch_ind, x1, y1, x2, y2]. The values in
        // parentheses are those of the article's worked example.
        int roi_batch_ind = bottom_rois[n * 5 + 0];
        int roi_start_w = round(bottom_rois[n * 5 + 1] * spatial_scale);  // left   (0)
        int roi_start_h = round(bottom_rois[n * 5 + 2] * spatial_scale);  // top    (3)
        int roi_end_w = round(bottom_rois[n * 5 + 3] * spatial_scale);    // right  (7)
        int roi_end_h = round(bottom_rois[n * 5 + 4] * spatial_scale);    // bottom (8)

        // Force malformed ROIs to be 1x1
        int roi_width = fmaxf(roi_end_w - roi_start_w + 1, 1);
        int roi_height = fmaxf(roi_end_h - roi_start_h + 1, 1);
        float bin_size_h = (float)(roi_height) / (float)(pooled_height);
        float bin_size_w = (float)(roi_width) / (float)(pooled_width);

        // Integer bounds of this bin relative to the ROI origin. floor/ceil
        // make neighbouring bins differ in size when bin_size is fractional.
        int hstart = (int)(floor((float)(ph) * bin_size_h));
        int wstart = (int)(floor((float)(pw) * bin_size_w));
        int hend = (int)(ceil((float)(ph + 1) * bin_size_h));
        int wend = (int)(ceil((float)(pw + 1) * bin_size_w));

        // Add roi offsets and clip to input boundaries
        hstart = fminf(fmaxf(hstart + roi_start_h, 0), height);
        hend = fminf(fmaxf(hend + roi_start_h, 0), height);
        wstart = fminf(fmaxf(wstart + roi_start_w, 0), width);
        wend = fminf(fmaxf(wend + roi_start_w, 0), width);

        // The bin is empty when, after clipping, it has no rows or no columns.
        bool is_empty = (hend <= hstart) || (wend <= wstart);

        // Define an empty pooling region to be zero
        float maxval = is_empty ? 0 : -FLT_MAX;
        // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
        int maxidx = -1;

        int bottom_data_batch_offset = roi_batch_ind * channels * height * width;
        int bottom_data_offset = bottom_data_batch_offset + c * height * width;

        // Max-pool over the bin; each index sees its own
        // [hstart, hend) x [wstart, wend) window.
        for (int h = hstart; h < hend; ++h) {
            for (int w = wstart; w < wend; ++w) {
                int bottom_index = h * width + w;
                if (bottom_data[bottom_data_offset + bottom_index] > maxval) {
                    maxval = bottom_data[bottom_data_offset + bottom_index];
                    maxidx = bottom_data_offset + bottom_index;
                }
            }
        }

        // Written once per output element, after the whole bin is scanned.
        top_data[index] = maxval;
        if (argmax_data != NULL)
            argmax_data[index] = maxidx;
    }
}

ROI-align

从上面ROI-pooling的实现过程不难看出，由于取整的影响，各个index方块中对应的宽高是不同的，有些是2，有些是3。而ROI-align做了一个小改动，使h, w可以是小数，并通过双线性内插取得各个像素值，消除了取整带来的误差。

此时，特征图上的pooling方框变成了下面这样：

/*
 * ROI-align forward kernel: samples one bilinearly interpolated value per
 * output element instead of max-pooling over an integer-aligned bin.
 *
 * Each value of `index` produced by CUDA_1D_KERNEL_LOOP handles one output
 * element, identified by (n, c, ph, pw) decoded from the flat index.
 *
 *   nthreads       upper bound passed to CUDA_1D_KERNEL_LOOP
 *   bottom_data    input feature map, indexed as
 *                  ((batch * channels + c) * height + h) * width + w
 *   spatial_scale  factor applied to ROI coordinates (no rounding here)
 *   height, width  spatial size of one feature-map channel
 *   channels       number of feature-map channels
 *   aligned_height, aligned_width
 *                  number of sample points per ROI along each axis
 *   bottom_rois    5 floats per ROI: batch index, x1, y1, x2, y2
 *   top_data       output: interpolated value, or 0 when the sample point
 *                  lies outside the feature map
 *
 * NOTE(review): bin_size_* divides by (aligned_* - 1), so aligned_height == 1
 * or aligned_width == 1 divides by zero; and hstart/wstart become negative
 * when height or width is < 2. Confirm callers never pass such sizes.
 */
__global__ void ROIAlignForward(const int nthreads, const float* bottom_data, const float spatial_scale, const int height, const int width,
    const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* top_data) {
    CUDA_1D_KERNEL_LOOP(index, nthreads) {
        // Decode the flat output index into (n, c, ph, pw).
        int pw = index % aligned_width;
        int ph = (index / aligned_width) % aligned_height;
        int c = (index / aligned_width / aligned_height) % channels;
        int n = index / aligned_width / aligned_height / channels;

        // ROI record layout: [batch_ind, x1, y1, x2, y2]; coordinates stay
        // fractional (no rounding, unlike ROI pooling).
        float roi_batch_ind = bottom_rois[n * 5 + 0];
        float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale;
        float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale;
        float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale;
        float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale;

        // Clamp the extent of malformed ROIs to be non-negative.
        float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.);
        float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.);
        float bin_size_h = roi_height / (aligned_height - 1.);
        float bin_size_w = roi_width / (aligned_width - 1.);

        // The sample position (h, w) is kept as float to avoid the error
        // that integer rounding would introduce.
        float h = (float)(ph) * bin_size_h + roi_start_h;
        float w = (float)(pw) * bin_size_w + roi_start_w;

        // Integer top-left neighbour of the sample point, capped so that the
        // +1 row/column neighbours stay inside the map; used below to
        // measure the fractional offset of (h, w).
        int hstart = fminf(floor(h), height - 2);
        int wstart = fminf(floor(w), width - 2);

        int img_start = roi_batch_ind * channels * height * width;

        // bilinear interpolation
        if (h < 0 || h >= height || w < 0 || w >= width) {
            top_data[index] = 0.;
        } else {
            // Fractional offset of the sample point from its top-left neighbour.
            float h_ratio = h - (float)(hstart);
            float w_ratio = w - (float)(wstart);
            int upleft = img_start + (c * height + hstart) * width + wstart;
            int upright = upleft + 1;
            int downleft = upleft + width;
            int downright = downleft + 1;

            // Weighted sum of the four neighbouring feature values.
            top_data[index] = bottom_data[upleft] * (1. - h_ratio) * (1. - w_ratio)
                + bottom_data[upright] * (1. - h_ratio) * w_ratio
                + bottom_data[downleft] * h_ratio * (1. - w_ratio)
                + bottom_data[downright] * h_ratio * w_ratio;
        }
    }
}

有些细心的同学可能发现了上面的代码没有进行pooling操作，只有内插。那是因为caffe实现roi-align时，在后面又接了一个avg_pooling 和max_pooling，以实现不同的roipooling操作，并且避免了重复开发。

Deformable Roi pooling

可变形roi提取方法来源于论文《Deformable Convolutional Networks》，文章介绍了形变卷积的方法（增加offset）和所带来的好处。其中，Deformable Roi pooling就是一种由此衍生而来的思路。

由于要实现可变形，所以代码中加入了offset变量。

template <typename DType>

本文发布于:2023-05-12 02:19:59，感谢您对本站的认可！

本文链接：https://www.wtabcd.cn/fanwen/fan/89/885884.html

上一篇：microRNA和ctDNA诊断上皮性卵巢癌间接比较的Meta分析

下一篇：思想汇报2022年入党积极分子范文(通用10篇)

标签：代码位置取整带来提取特征参数理解

留言与评论（共有 0 条评论）