Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: Add new operator ml_nms #325

Open
wants to merge 41 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
78ba1b7
Create test.txt
wenzhengyin Aug 12, 2022
44a4f4f
Add files via upload
wenzhengyin Aug 12, 2022
11bf820
Add files via upload
wenzhengyin Aug 12, 2022
549175b
Merge branch 'Cambricon:master' into master
wenzhengyin Aug 12, 2022
5899586
Delete unary_op_block.h
wenzhengyin Aug 12, 2022
7438d4f
Delete unary_op_union.h
wenzhengyin Aug 12, 2022
a2ddcea
Delete ml_nms.cpp
wenzhengyin Aug 12, 2022
ee16a1c
Delete ml_nms_block.mlu
wenzhengyin Aug 12, 2022
edd4c12
Delete ml_nms_union.mlu
wenzhengyin Aug 12, 2022
393e41e
Add files via upload
wenzhengyin Aug 12, 2022
4a83ca2
Merge pull request #2 from Jones154/wzy-ml_nms
wenzhengyin Aug 12, 2022
8cda3f6
Merge branch 'Cambricon:master' into master
wenzhengyin Oct 20, 2022
f83a76a
Merge branch 'Cambricon:master' into master
wenzhengyin Oct 25, 2022
ad2dfbb
Merge branch 'Cambricon:master' into master
wenzhengyin Dec 13, 2022
487b228
Update mlu_op.h
wenzhengyin Dec 13, 2022
3ee782a
Update mlu_op_kernel.h
wenzhengyin Dec 13, 2022
36c5926
Update ml_nms.cpp
wenzhengyin Dec 13, 2022
69570d7
Delete ml_nms_block.mlu
wenzhengyin Dec 13, 2022
c6d90ac
Delete ml_nms_union.mlu
wenzhengyin Dec 13, 2022
ed65307
Delete test.txt
wenzhengyin Dec 13, 2022
394f998
Create ml_nms.h
wenzhengyin Dec 13, 2022
01efa33
Create ml_nms.mlu
wenzhengyin Dec 13, 2022
21cd9aa
Update mlu_op_test.proto
wenzhengyin Dec 13, 2022
a355d0e
Create ml_nms.cpp
wenzhengyin Dec 13, 2022
6cef39d
Create ml_nms.h
wenzhengyin Dec 13, 2022
565840d
Create case_0.prototxt
wenzhengyin Dec 13, 2022
f54a8b6
Update ml_nms.cpp
wenzhengyin Dec 13, 2022
7ba8626
Update ml_nms.cpp
wenzhengyin Dec 13, 2022
90fa38e
Update ml_nms.cpp
wenzhengyin Dec 13, 2022
32191dd
Update mlu_op.h
wenzhengyin Dec 13, 2022
d5d6be6
Update ml_nms.h
wenzhengyin Dec 13, 2022
b826fd2
Update mlu_op.h
wenzhengyin Dec 13, 2022
d5ade70
Create ml_nms.cpp
wenzhengyin Dec 13, 2022
bffdca6
Update ml_nms.cpp
wenzhengyin Dec 13, 2022
445333d
Update ml_nms.cpp
wenzhengyin Dec 13, 2022
be55b03
Update ml_nms.mlu
wenzhengyin Dec 13, 2022
9436e06
Delete ml_nms.h
wenzhengyin Dec 13, 2022
1476296
Update mlu_op_kernel.h
wenzhengyin Feb 3, 2023
bb457d7
Update ml_nms.cpp
wenzhengyin Feb 3, 2023
7d0f804
Update ml_nms.mlu
wenzhengyin Feb 3, 2023
512d7b0
Update case_0.prototxt
wenzhengyin Feb 3, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 141 additions & 0 deletions bangc-ops/kernels/ml_nms/ml_nms.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
/*************************************************************************
* Copyright (C) [2022] by Cambricon, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
************************************************************************/
#include <stdio.h>
#include <string>
#include "core/context.h"
#include "core/gen_case.h"
#include "core/logging.h"
#include "core/runtime/device.h"
#include "core/tensor.h"
#include "core/type.h"
#include "kernels/kernel.h"
#include "mlu_op_kernel.h"
#include "mlu_op.h"
#include "cnrt.h"
#include "cndev.h"

static inline bool isSupportType(const mluOpDataType_t check_type,
const mluOpDataType_t support_type[],
const int len) {
for (int i = 0; i < len; ++i) {
if (check_type == support_type[i]) {
return true;
}
}
return false;
}

mluOpStatus_t MlNmsParamCheck(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
mluOpStatus_t MlNmsParamCheck(
mluOpStatus_t mlNmsParamCheck(

const std::string &op_name, const mluOpHandle_t &handle,
const mluOpTensorDescriptor_t &x_desc, const void *x,
const mluOpDataType_t support_type[], const int &len) {
PARAM_CHECK(op_name, x_desc != NULL);
PARAM_CHECK(op_name, handle != NULL);

// check data type
if (!isSupportType(x_desc->dtype, support_type, len)) {
LOG(ERROR) << op_name << ":x_desc's data type is not supported.";
return MLUOP_STATUS_BAD_PARAM;
}
PARAM_CHECK(op_name, x != NULL);
return MLUOP_STATUS_SUCCESS;
}


static void policyFunc(const mluOpHandle_t &handle,
const mluOpTensorDescriptor_t desc, cnrtDim3_t *k_dim,
cnrtFunctionType_t *k_type) {
size_t dim = mluOpGetTensorElementNum(desc);
// Union1 policyFunc
*k_type = CNRT_FUNC_TYPE_UNION1;
k_dim->x = handle->core_num_per_cluster;
k_dim->y = mluop::runtime::getClusterLimitCapability(handle);
k_dim->z = 1;
// if a case is smaller than 2048 , it just need one cluster can work best.
size_t small_case_thread = 2048;
if (dim <= small_case_thread) k_dim->y = 1;
}

mluOpStatus_t MLUOP_WIN_API mluOpMlNms(mluOpHandle_t handle,
const mluOpTensorDescriptor_t boxes_data_ptr_desc, void* boxes_data_ptr,
float iou_threshold, void* output_boxes_index) {

mluOpDataType_t support_type[2] = {MLUOP_DTYPE_HALF, MLUOP_DTYPE_FLOAT};
mluOpStatus_t param_check = MlNmsParamCheck(
"[mluOpMlNms]", handle, boxes_data_ptr_desc, boxes_data_ptr,
support_type, 2);

if (param_check != MLUOP_STATUS_SUCCESS) {
return param_check;
}

cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
policyFunc(handle, boxes_data_ptr_desc, &k_dim, &k_type);
int input_boxes_num = boxes_data_ptr_desc->total_element_num / 6;
int apply_nram_size = 0;
int boxes_start_position = 0;
int loop_num = 0;
void (*mluOpFuncKernel)(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, mluOpDataType_t data_type, void* boxes_data_ptr,
float nmsThres, int input_boxes_num, int boxes_start_position,
uint8_t* output_boxes_index);

if (boxes_data_ptr_desc->dtype == MLUOP_DTYPE_HALF) {
mluOpFuncKernel = mluOpKernelMlNmsHalfFast;
apply_nram_size = (input_boxes_num * 6 * 2) + (input_boxes_num * 14 * 2);
} else {
mluOpFuncKernel = mluOpKernelMlNmsFloatFast;
apply_nram_size = (input_boxes_num * 6 * 4) + (input_boxes_num * 14 * 4);
}
if (apply_nram_size > MAX_NRAM_SIZE) {
if ((apply_nram_size % MAX_NRAM_SIZE) !=0) {
loop_num = (apply_nram_size / MAX_NRAM_SIZE) + 1;
} else {
loop_num = apply_nram_size / MAX_NRAM_SIZE;
}
}
if (loop_num > 0) {
Comment on lines +104 to +118
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这部分循环放在 kernel 内(mluOpFuncKernel)处理更合适,减少kernl launch的开销

for (int i = 0; i < loop_num; i++) {
boxes_start_position = i * (input_boxes_num / loop_num);
KERNEL_CHECK((mluOpFuncKernel(k_dim, k_type, handle->queue,
boxes_data_ptr_desc->dtype,
boxes_data_ptr,
iou_threshold,
input_boxes_num,
boxes_start_position,
(uint8_t*)output_boxes_index)));
}
} else {
KERNEL_CHECK((mluOpFuncKernel(k_dim, k_type, handle->queue,
boxes_data_ptr_desc->dtype,
boxes_data_ptr,
iou_threshold,
input_boxes_num,
boxes_start_position,
(uint8_t*)output_boxes_index)));
}
GEN_CASE_END();

return MLUOP_STATUS_SUCCESS;
}
252 changes: 252 additions & 0 deletions bangc-ops/kernels/ml_nms/ml_nms.mlu
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
/*************************************************************************
* Copyright (C) [2022] by Cambricon, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
************************************************************************/
#include "bang.h"
#include "mlu_op_kernel.h"
#include "kernels/kernel.h"

#define NRAM_SIZE 2 * 1024
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这个宏没有被使用

#define UNION_OP_KERNEL_DECLARE(Op, DType, Prefer) \
__mlu_global__ void MLUBlockKernel##Op##DType##Prefer( \
mluOpDataType_t data_type, void* boxes_data_ptr, \
float nms_thres, int input_boxes_num, int boxes_start_position, \
uint8_t* output_boxes_index); \

#define UNION_OP_KERNEL_IMPLE(Op, DType, Prefer) \
__mlu_global__ void MLUOpKernel##Op##DType##Prefer( \
mluOpDataType_t data_type, void* boxes_data_ptr, \
float nms_thres, int input_boxes_num, int boxes_start_position, \
uint8_t* output_boxes_index) { \
int offset, seg; \
getOffsetNum##Op##Prefer(input_boxes_num, &offset); \
getSegNumMlNmsFast(input_boxes_num, &seg); \
unionImple<DType, compute##Op##Prefer>( \
(DType*)boxes_data_ptr, (DType)nms_thres, \
offset, seg, input_boxes_num, boxes_start_position, output_boxes_index);}

template <typename T, void (*OpFunc)(T*, T, int, int, int, uint8_t*)>
__mlu_device__ void unionImple(T* boxes_data_ptr,
T nms_thres, int offset, int seg, int input_boxes_num,
int boxes_start_position, uint8_t* output_boxes_index) {
__nram__ char worke_space[MAX_NRAM_SIZE / 16];
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
__nram__ char worke_space[MAX_NRAM_SIZE / 16];
__nram__ char work_space[MAX_NRAM_SIZE / 16];

__memcpy((T*)worke_space,
boxes_data_ptr + ((boxes_start_position + offset) * 6),
seg * 6 * sizeof(T),
GDRAM2NRAM);
__memcpy((T*)worke_space + (seg * 6),
boxes_data_ptr,
6 * sizeof(T),
GDRAM2NRAM);
OpFunc((T*)worke_space, nms_thres, input_boxes_num, offset,
seg, output_boxes_index);
}

__mlu_func__ void getComputeLen(int seg, int elem_byte, int* compute_len) {
#if (__BANG_ARCH__ < 200)
*compute_len = (seg * elem_byte % 64) == 0 ?
seg : (seg * elem_byte / 64 + 1) * 64 / elem_byte;
#elif (__BANG_ARCH__ > 200 && __BANG_ARCH__ < 300)
*compute_len = (seg * elem_byte % 128) == 0 ?
seg : (seg * elem_byte / 128 + 1) * 128 / elem_byte;
#elif (__BANG_ARCH__ > 300)
*compute_len = seg;
#endif
}
__mlu_func__ void getOffsetNumMlNmsFast(int input_boxes_num, int* offset) {
if (taskDim > 1) {
*offset = (input_boxes_num % taskDim) > taskId ?
(input_boxes_num / taskDim + 1) * taskId :
(input_boxes_num / taskDim) * taskId + (input_boxes_num % taskDim);
} else {
*offset = input_boxes_num;
}
}

__mlu_func__ void getSegNumMlNmsFast(int input_boxes_num, int* seg) {
if (taskDim > 1) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

因为使用的是U1任务,这里taskDim一定会大于1。建议将该判断去除,默认使用taskDim > 1时的计算。

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

为后续block任务预留

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

如果taskDim = 1,uint32_t((input_boxes_num % taskDim) > taskId)结果为0,此时*seg通过(input_boxes_num / taskDim) + uint32_t((input_boxes_num % taskDim) > taskId)得到的值和input_boxes_num相同。

*seg = (input_boxes_num / taskDim) +
uint32_t((input_boxes_num % taskDim) > taskId);
} else {
*seg = input_boxes_num;
}
}

template <typename T>
__mlu_func__ void computeMlNmsFast(T* worke_space,
T nms_thres, int input_boxes_num, int offset,
int seg, uint8_t* output_boxes_index) {
__nram__ T scores_max_boxes_area;
__nram__ T w_s, h_s;
__nram__ T* scores_max_boxes;
__nram__ T* x1;
__nram__ T* y1;
__nram__ T* x2;
__nram__ T* y2;
__nram__ T* w;
__nram__ T* h;
__nram__ T* area_ptr;
__nram__ T* inter_area_ptr;
__nram__ T* scores_max_boxes_area_ptr;
__nram__ T* nms_thres_ptr;
__nram__ T* scores_max_boxes_ptr;
__nram__ T* tem;
__nram__ uint8_t* similar_index;
__nram__ uint8_t* result;
int compute_len;
int i, j;
int data_len = seg * 6 + 6;

// ----------------------allocate memory---------------------
getComputeLen(seg, sizeof(T), &compute_len);
scores_max_boxes = worke_space + (seg * 6);
x1 = worke_space + data_len;
y1 = worke_space + (data_len + compute_len);
x2 = worke_space + (data_len + (compute_len * 2));
y2 = worke_space + (data_len + (compute_len * 3));
Comment on lines +121 to +124
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
x1 = worke_space + data_len;
y1 = worke_space + (data_len + compute_len);
x2 = worke_space + (data_len + (compute_len * 2));
y2 = worke_space + (data_len + (compute_len * 3));
x1 = worke_space + data_len;
y1 = x1 + compute_len;
x2 = y1 + compute_len;
y2 = x2 + compute_len;

data_len = data_len + (compute_len * 4);
w = worke_space + data_len;
h = worke_space + (data_len + compute_len);
area_ptr = worke_space + (data_len + (compute_len * 2));
inter_area_ptr = worke_space + (data_len + (compute_len * 3));
scores_max_boxes_area_ptr = worke_space + (data_len + (compute_len * 4));
nms_thres_ptr = worke_space + (data_len + (compute_len * 5));
scores_max_boxes_ptr = worke_space + (data_len + (compute_len * 6));
tem = worke_space + (data_len + (compute_len * 7));
if (sizeof(T) == sizeof(uint8_t)) {
similar_index = (uint8_t*)worke_space + (data_len + (compute_len * 8));
result = (uint8_t*)worke_space + (data_len + (compute_len * 8) + seg);
} else {
similar_index = (uint8_t*)worke_space + ((data_len + (compute_len * 8)) *
(sizeof(T) / sizeof(uint8_t)));
result = (uint8_t*)worke_space + ((data_len + (compute_len * 8)) *
(sizeof(T) / sizeof(uint8_t)) + seg);
}
for (i = 0, j = 0; i < seg * 6; i+=6, j++) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
for (i = 0, j = 0; i < seg * 6; i+=6, j++) {
for (i = 0, j = 0; i < seg * 6; i += 6, j++) {

if (*(scores_max_boxes + 5) == worke_space[i + 5]) {
similar_index[j] = 1;
x1[j] = worke_space[i];
y1[j] = worke_space[i + 1];
x2[j] = worke_space[i + 2];
y2[j] = worke_space[i + 3];
} else {
similar_index[j] = 0;
x1[j] = 0.0;
y1[j] = 0.0;
x2[j] = 0.0;
y2[j] = 0.0;
}
}

// -----------------iou detect--------------------
// fing all boxes area
__bang_sub(h, y1, y2, compute_len);
__bang_sub(w, x2, x1, compute_len);
__bang_mul(area_ptr, h, w, compute_len);
Comment on lines +161 to +163
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

面积计算公式为:

area = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 

/是否少了 + 1操作?


// max x1
__bang_write_value(scores_max_boxes_ptr, compute_len, scores_max_boxes[0]);
__bang_cycle_sub(x1, x1, scores_max_boxes_ptr, compute_len, compute_len);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
__bang_cycle_sub(x1, x1, scores_max_boxes_ptr, compute_len, compute_len);
__bang_sub(x1, x1, scores_max_boxes_ptr);

__bang_active_relu(x1, x1, compute_len);
__bang_cycle_add(x1, x1, scores_max_boxes_ptr, compute_len, compute_len);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
__bang_cycle_add(x1, x1, scores_max_boxes_ptr, compute_len, compute_len);
__bang_add(x1, x1, scores_max_boxes_ptr);

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里是想要求 x1 和 scores_max_boxes[0] 的最大值的吗?为什么不使用 bang_max();
| x1 - scores_max_boxes[0] | + scores_max_boxes[0],如果x1小,得到的是 2 * scores_max_boxes[0] - x1;这里是否存在问题?


// min y1
__bang_write_value(scores_max_boxes_ptr, compute_len, scores_max_boxes[1]);
__bang_write_zero(tem, compute_len);
__bang_cycle_add(tem, tem, scores_max_boxes_ptr, compute_len, compute_len);
Comment on lines +172 to +174
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

为什么不直接使用scores_max_boxes_ptr呢,从后面的代码看该部分的数据会被重写。

__bang_sub(tem, y1, scores_max_boxes_ptr, compute_len);
__bang_active_relu(tem, tem, compute_len);
__bang_sub(y1, y1, tem, compute_len);

// min x2
__bang_write_value(scores_max_boxes_ptr, compute_len, scores_max_boxes[2]);
__bang_write_zero(tem, compute_len);
__bang_cycle_add(tem, tem, scores_max_boxes_ptr, compute_len, compute_len);
__bang_sub(tem, x2, scores_max_boxes_ptr, compute_len);
__bang_active_relu(tem, tem, compute_len);
__bang_sub(x2, x2, tem, compute_len);

// max y2
__bang_write_value(scores_max_boxes_ptr, compute_len, scores_max_boxes[3]);
__bang_cycle_sub(y2, y2, scores_max_boxes_ptr, compute_len, compute_len);
__bang_active_relu(y2, y2, compute_len);
__bang_cycle_add(y2, y2, scores_max_boxes_ptr, compute_len, compute_len);

// --------- intesection-------
// fing W
__bang_sub(w, x2, x1, compute_len);
__bang_active_relu(w, w, compute_len);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

为什么要求激活呢?如果本来就不想交,两者得到的是负数,这里取绝对值是否存在问题?


// find H
__bang_sub(h, y1, y2, compute_len);
__bang_active_relu(h, h, compute_len);

// fing intersection
__bang_mul(inter_area_ptr, h, w, compute_len);

// fing scores max boxes area
w_s = scores_max_boxes[2] - scores_max_boxes[0];
h_s = scores_max_boxes[1] - scores_max_boxes[3];
scores_max_boxes_area = w_s * h_s;

__bang_write_value(scores_max_boxes_area_ptr, compute_len,
scores_max_boxes_area);
__bang_cycle_add(tem, area_ptr, scores_max_boxes_area_ptr,
compute_len, compute_len);
Comment on lines +212 to +213
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
__bang_cycle_add(tem, area_ptr, scores_max_boxes_area_ptr,
compute_len, compute_len);
__bang_add(tem, area_ptr, scores_max_boxes_area_ptr);

综合代码来看,tem这块内存并不需要,其他位置可以使用scores_max_boxes_area_ptr进行操作,这里可以将结果保存在area_ptr中。

__bang_sub(tem, tem, inter_area_ptr, compute_len);
__bang_write_value(nms_thres_ptr, compute_len, nms_thres);
__bang_cycle_mul(tem, tem, nms_thres_ptr, compute_len, compute_len);

__bang_le(tem, inter_area_ptr, tem, compute_len);

for (int i = 0; i < seg; i++) {
if (tem[i] && similar_index[i]) {
result[i] = 1;
} else {
result[i] = 0;
}
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

该for循环是否可以使用 bang_mul 处理?

__memcpy(output_boxes_index + offset, result, seg * sizeof(uint8_t),
NRAM2GDRAM);
}

UNION_OP_KERNEL_IMPLE(MlNms, float, Fast);
UNION_OP_KERNEL_IMPLE(MlNms, half, Fast);

void MLUOP_WIN_API mluOpKernelMlNmsFloatFast(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
mluOpDataType_t data_type, void* boxes_data_ptr,
float nms_thres, int input_boxes_num, int boxes_start_position,
uint8_t* output_boxes_index) {
MLUOpKernelMlNmsfloatFast<<<k_dim, k_type, queue>>>(
data_type, boxes_data_ptr, nms_thres,
input_boxes_num, boxes_start_position, output_boxes_index);
}

void MLUOP_WIN_API mluOpKernelMlNmsHalfFast(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
mluOpDataType_t data_type, void* boxes_data_ptr,
float nms_thres, int input_boxes_num, int boxes_start_position,
uint8_t* output_boxes_index) {
MLUOpKernelMlNmshalfFast<<<k_dim, k_type, queue>>>(
data_type, boxes_data_ptr, nms_thres,
input_boxes_num, boxes_start_position, output_boxes_index);
}
Loading