-
Notifications
You must be signed in to change notification settings - Fork 106
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Feature: Add new operator ml_nms #325
base: master
Are you sure you want to change the base?
Changes from 37 commits
78ba1b7
44a4f4f
11bf820
549175b
5899586
7438d4f
a2ddcea
ee16a1c
edd4c12
393e41e
4a83ca2
8cda3f6
f83a76a
ad2dfbb
487b228
3ee782a
36c5926
69570d7
c6d90ac
ed65307
394f998
01efa33
21cd9aa
a355d0e
6cef39d
565840d
f54a8b6
7ba8626
90fa38e
32191dd
d5d6be6
b826fd2
d5ade70
bffdca6
445333d
be55b03
9436e06
1476296
bb457d7
7d0f804
512d7b0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
/************************************************************************* | ||
* Copyright (C) [2022] by Cambricon, Inc. | ||
* | ||
* Permission is hereby granted, free of charge, to any person obtaining a | ||
* copy of this software and associated documentation files (the | ||
* "Software"), to deal in the Software without restriction, including | ||
* without limitation the rights to use, copy, modify, merge, publish, | ||
* distribute, sublicense, and/or sell copies of the Software, and to | ||
* permit persons to whom the Software is furnished to do so, subject to | ||
* the following conditions: | ||
* | ||
* The above copyright notice and this permission notice shall be included | ||
* in all copies or substantial portions of the Software. | ||
* | ||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS | ||
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | ||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY | ||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | ||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | ||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||
************************************************************************/ | ||
#include <stdio.h> | ||
#include <string> | ||
#include "core/context.h" | ||
#include "core/gen_case.h" | ||
#include "core/logging.h" | ||
#include "core/runtime/device.h" | ||
#include "core/tensor.h" | ||
#include "core/type.h" | ||
#include "mlu_op_kernel.h" | ||
#include "mlu_op.h" | ||
#include "cnrt.h" | ||
#include "cndev.h" | ||
|
||
static inline bool isSupportType(const mluOpDataType_t check_type, | ||
const mluOpDataType_t support_type[], | ||
const int len) { | ||
for (int i = 0; i < len; ++i) { | ||
if (check_type == support_type[i]) { | ||
return true; | ||
} | ||
} | ||
return false; | ||
} | ||
|
||
mluOpStatus_t MlNmsParamCheck( | ||
const std::string &op_name, const mluOpHandle_t &handle, | ||
const mluOpTensorDescriptor_t &x_desc, const void *x, | ||
const mluOpDataType_t support_type[], const int &len) { | ||
PARAM_CHECK(op_name, x_desc != NULL); | ||
PARAM_CHECK(op_name, handle != NULL); | ||
|
||
// check data type | ||
if (!isSupportType(x_desc->dtype, support_type, len)) { | ||
LOG(ERROR) << op_name << ":x_desc's data type is not supported."; | ||
return MLUOP_STATUS_BAD_PARAM; | ||
} | ||
PARAM_CHECK(op_name, x != NULL); | ||
return MLUOP_STATUS_SUCCESS; | ||
} | ||
|
||
|
||
static void policyFunc(const mluOpHandle_t &handle, | ||
const mluOpTensorDescriptor_t desc, cnrtDim3_t *k_dim, | ||
cnrtFunctionType_t *k_type) { | ||
size_t dim = mluOpGetTensorElementNum(desc); | ||
// Union1 policyFunc | ||
*k_type = CNRT_FUNC_TYPE_UNION1; | ||
k_dim->x = handle->core_num_per_cluster; | ||
k_dim->y = mluop::runtime::getClusterLimitCapability(handle); | ||
k_dim->z = 1; | ||
// if a case is smaller than 2048 , it just need one cluster can work best. | ||
size_t small_case_thread = 2048; | ||
if (dim <= small_case_thread) k_dim->y = 1; | ||
} | ||
|
||
mluOpStatus_t MLUOP_WIN_API mluOpMlNms(mluOpHandle_t handle, | ||
const mluOpTensorDescriptor_t boxes_data_ptr_desc, void* boxes_data_ptr, | ||
float iou_threshold, void* output_boxes_index) { | ||
|
||
mluOpDataType_t support_type[2] = {MLUOP_DTYPE_HALF, MLUOP_DTYPE_FLOAT}; | ||
mluOpStatus_t param_check = MlNmsParamCheck( | ||
"[mluOpMlNms]", handle, boxes_data_ptr_desc, boxes_data_ptr, | ||
support_type, 2); | ||
|
||
if (param_check != MLUOP_STATUS_SUCCESS) { | ||
return param_check; | ||
} | ||
|
||
cnrtDim3_t k_dim; | ||
cnrtFunctionType_t k_type; | ||
policyFunc(handle, boxes_data_ptr_desc, &k_dim, &k_type); | ||
int input_boxes_num = boxes_data_ptr_desc->total_element_num / 4; | ||
void (*mluOpFuncKernel)(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, | ||
cnrtQueue_t queue, mluOpDataType_t data_type, void* boxes_data_ptr, | ||
float nmsThres, int input_boxes_num, uint8_t* output_boxes_index); | ||
|
||
if (boxes_data_ptr_desc->dtype == MLUOP_DTYPE_HALF) { | ||
mluOpFuncKernel = mluOpKernelMlNmsHalfFast; | ||
} else { | ||
mluOpFuncKernel = mluOpKernelMlNmsFloatFast; | ||
} | ||
|
||
KERNEL_CHECK( | ||
(mluOpFuncKernel(k_dim, k_type, handle->queue, | ||
boxes_data_ptr_desc->dtype, boxes_data_ptr, | ||
iou_threshold, input_boxes_num, (uint8_t*)output_boxes_index))); | ||
GEN_CASE_END(); | ||
|
||
return MLUOP_STATUS_SUCCESS; | ||
} |
Original file line number | Diff line number | Diff line change | ||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,230 @@ | ||||||||||||||||||
/************************************************************************* | ||||||||||||||||||
* Copyright (C) [2022] by Cambricon, Inc. | ||||||||||||||||||
* | ||||||||||||||||||
* Permission is hereby granted, free of charge, to any person obtaining a | ||||||||||||||||||
* copy of this software and associated documentation files (the | ||||||||||||||||||
* "Software"), to deal in the Software without restriction, including | ||||||||||||||||||
* without limitation the rights to use, copy, modify, merge, publish, | ||||||||||||||||||
* distribute, sublicense, and/or sell copies of the Software, and to | ||||||||||||||||||
* permit persons to whom the Software is furnished to do so, subject to | ||||||||||||||||||
* the following conditions: | ||||||||||||||||||
* | ||||||||||||||||||
* The above copyright notice and this permission notice shall be included | ||||||||||||||||||
* in all copies or substantial portions of the Software. | ||||||||||||||||||
* | ||||||||||||||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS | ||||||||||||||||||
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||||||||||||||||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | ||||||||||||||||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY | ||||||||||||||||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | ||||||||||||||||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | ||||||||||||||||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||||||||||||||||||
************************************************************************/ | ||||||||||||||||||
#include "bang.h" | ||||||||||||||||||
#include "mlu_op_kernel.h" | ||||||||||||||||||
#include "kernels/kernel.h" | ||||||||||||||||||
|
||||||||||||||||||
#define NRAM_SIZE 2 * 1024 | ||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 这个宏没有被使用 |
||||||||||||||||||
#define UNION_OP_KERNEL_DECLARE(Op, DType, Prefer) \ | ||||||||||||||||||
__mlu_global__ void MLUBlockKernel##Op##DType##Prefer(\ | ||||||||||||||||||
mluOpDataType_t data_type, void* boxes_data_ptr, \ | ||||||||||||||||||
float nms_thres, int input_boxes_num, uint8_t* output_boxes_index);\ | ||||||||||||||||||
|
||||||||||||||||||
#define UNION_OP_KERNEL_IMPLE(Op, DType, Prefer) \ | ||||||||||||||||||
__mlu_global__ void MLUOpKernel##Op##DType##Prefer( \ | ||||||||||||||||||
mluOpDataType_t data_type, void* boxes_data_ptr, \ | ||||||||||||||||||
float nms_thres, int input_boxes_num, uint8_t* output_boxes_index) {\ | ||||||||||||||||||
int offset, seg; \ | ||||||||||||||||||
getOffsetNum##Op##Prefer(input_boxes_num, &offset); \ | ||||||||||||||||||
getSegNumMlNmsFast(input_boxes_num, &seg); \ | ||||||||||||||||||
unionImple<DType, compute##Op##Prefer>( \ | ||||||||||||||||||
(DType*)boxes_data_ptr, (DType)nms_thres, \ | ||||||||||||||||||
offset, seg, input_boxes_num, output_boxes_index);} | ||||||||||||||||||
|
||||||||||||||||||
template <typename T, void (*OpFunc)(T*, T, int, int, int, uint8_t*)> | ||||||||||||||||||
__mlu_device__ void unionImple(T* boxes_data_ptr, T nms_thres, int offset, | ||||||||||||||||||
int seg, int input_boxes_num, uint8_t* output_boxes_index) { | ||||||||||||||||||
__nram__ char worke_space[MAX_NRAM_SIZE / 16]; | ||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||
__memcpy((T*)worke_space, | ||||||||||||||||||
boxes_data_ptr + (offset * 4), | ||||||||||||||||||
seg * 4 * sizeof(T), | ||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. seg是通过所有box的个数在taskDim上进行拆分得到的,这里考虑不严谨(不应该认为计算得到的seg数量能够完全保存在nram上),这里实际需要的是片上单次处理的数据量,应该和内存大小相关。 |
||||||||||||||||||
GDRAM2NRAM); | ||||||||||||||||||
__memcpy((T*)worke_space + (seg * 4), | ||||||||||||||||||
boxes_data_ptr, | ||||||||||||||||||
4 * sizeof(T), | ||||||||||||||||||
GDRAM2NRAM); | ||||||||||||||||||
OpFunc((T*)worke_space, nms_thres, input_boxes_num, offset, | ||||||||||||||||||
seg, output_boxes_index); | ||||||||||||||||||
} | ||||||||||||||||||
|
||||||||||||||||||
__mlu_func__ void getComputeLen(int seg, int elem_byte, int* compute_len) { | ||||||||||||||||||
#if (__BANG_ARCH__ < 200) | ||||||||||||||||||
*compute_len = (seg * elem_byte % 64) == 0 ? | ||||||||||||||||||
seg : (seg * elem_byte / 64 + 1) * 64 / elem_byte; | ||||||||||||||||||
#elif (__BANG_ARCH__ > 200 && __BANG_ARCH__ < 300) | ||||||||||||||||||
*compute_len = (seg * elem_byte % 128) == 0 ? | ||||||||||||||||||
seg : (seg * elem_byte / 128 + 1) * 128 / elem_byte; | ||||||||||||||||||
#elif (__BANG_ARCH__ > 300) | ||||||||||||||||||
*compute_len = seg; | ||||||||||||||||||
#endif | ||||||||||||||||||
} | ||||||||||||||||||
__mlu_func__ void getOffsetNumMlNmsFast(int input_boxes_num, int* offset) { | ||||||||||||||||||
if (taskDim > 1) { | ||||||||||||||||||
*offset = (input_boxes_num % taskDim) > taskId ? | ||||||||||||||||||
(input_boxes_num / taskDim + 1) * taskId : | ||||||||||||||||||
(input_boxes_num / taskDim) * taskId + (input_boxes_num % taskDim); | ||||||||||||||||||
} else { | ||||||||||||||||||
*offset = input_boxes_num; | ||||||||||||||||||
} | ||||||||||||||||||
} | ||||||||||||||||||
|
||||||||||||||||||
__mlu_func__ void getSegNumMlNmsFast(int input_boxes_num, int* seg) { | ||||||||||||||||||
if (taskDim > 1) { | ||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 因为使用的是U1任务,这里taskDim一定会大于1。建议将该判断去除,默认使用taskDim > 1时的计算。 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 为后续block任务预留 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 如果taskDim = 1,uint32_t((input_boxes_num % taskDim) > taskId)结果为0,此时*seg通过(input_boxes_num / taskDim) + uint32_t((input_boxes_num % taskDim) > taskId)得到的值和input_boxes_num相同。 |
||||||||||||||||||
*seg = (input_boxes_num / taskDim) + | ||||||||||||||||||
uint32_t((input_boxes_num % taskDim) > taskId); | ||||||||||||||||||
} else { | ||||||||||||||||||
*seg = input_boxes_num; | ||||||||||||||||||
} | ||||||||||||||||||
} | ||||||||||||||||||
|
||||||||||||||||||
template <typename T> | ||||||||||||||||||
__mlu_func__ void computeMlNmsFast(T* worke_space, | ||||||||||||||||||
T nms_thres, int input_boxes_num, int offset, | ||||||||||||||||||
int seg, uint8_t* output_boxes_index) { | ||||||||||||||||||
__nram__ T scores_max_boxes_area; | ||||||||||||||||||
__nram__ T w_s, h_s; | ||||||||||||||||||
__nram__ T* scores_max_boxes; | ||||||||||||||||||
__nram__ T* x1; | ||||||||||||||||||
__nram__ T* y1; | ||||||||||||||||||
__nram__ T* x2; | ||||||||||||||||||
__nram__ T* y2; | ||||||||||||||||||
__nram__ T* w; | ||||||||||||||||||
__nram__ T* h; | ||||||||||||||||||
__nram__ T* area_ptr; | ||||||||||||||||||
__nram__ T* inter_area_ptr; | ||||||||||||||||||
__nram__ T* scores_max_boxes_area_ptr; | ||||||||||||||||||
__nram__ T* nms_thres_ptr; | ||||||||||||||||||
__nram__ T* scores_max_boxes_ptr; | ||||||||||||||||||
__nram__ T* tem; | ||||||||||||||||||
__nram__ uint8_t* result; | ||||||||||||||||||
int compute_len; | ||||||||||||||||||
int i, j; | ||||||||||||||||||
int data_len = seg * 4 + 4; | ||||||||||||||||||
|
||||||||||||||||||
getComputeLen(seg, sizeof(T), &compute_len); | ||||||||||||||||||
scores_max_boxes = worke_space + (seg * 4); | ||||||||||||||||||
x1 = worke_space + data_len; | ||||||||||||||||||
y1 = worke_space + (data_len + compute_len); | ||||||||||||||||||
x2 = worke_space + (data_len + (compute_len * 2)); | ||||||||||||||||||
y2 = worke_space + (data_len + (compute_len * 3)); | ||||||||||||||||||
Comment on lines
+121
to
+124
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||
|
||||||||||||||||||
data_len = data_len + (compute_len * 4); | ||||||||||||||||||
|
||||||||||||||||||
for (i = 0, j = 0; i < seg * 4; i+=4, j++) { | ||||||||||||||||||
x1[j] = worke_space[i]; | ||||||||||||||||||
y1[j] = worke_space[i + 1]; | ||||||||||||||||||
x2[j] = worke_space[i + 2]; | ||||||||||||||||||
y2[j] = worke_space[i + 3]; | ||||||||||||||||||
} | ||||||||||||||||||
w = worke_space + data_len; | ||||||||||||||||||
h = worke_space + (data_len + compute_len); | ||||||||||||||||||
area_ptr = worke_space + (data_len + (compute_len * 2)); | ||||||||||||||||||
inter_area_ptr = worke_space + (data_len + (compute_len * 3)); | ||||||||||||||||||
scores_max_boxes_area_ptr = worke_space + (data_len + (compute_len * 4)); | ||||||||||||||||||
nms_thres_ptr = worke_space + (data_len + (compute_len * 5)); | ||||||||||||||||||
scores_max_boxes_ptr = worke_space + (data_len + (compute_len * 6)); | ||||||||||||||||||
tem = worke_space + (data_len + (compute_len * 7)); | ||||||||||||||||||
result = (uint8_t*)worke_space + (data_len + (compute_len * 8)); | ||||||||||||||||||
|
||||||||||||||||||
// -----------------iou detect-------------------- | ||||||||||||||||||
// fing all boxes area | ||||||||||||||||||
__bang_sub(h, y1, y2, compute_len); | ||||||||||||||||||
__bang_sub(w, x2, x1, compute_len); | ||||||||||||||||||
__bang_mul(area_ptr, h, w, compute_len); | ||||||||||||||||||
Comment on lines
+161
to
+163
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 面积计算公式为: area = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); /是否少了 + 1操作? |
||||||||||||||||||
|
||||||||||||||||||
// max x1 | ||||||||||||||||||
__bang_write_value(scores_max_boxes_ptr, compute_len, scores_max_boxes[0]); | ||||||||||||||||||
__bang_cycle_sub(x1, x1, scores_max_boxes_ptr, compute_len, compute_len); | ||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||
__bang_active_relu(x1, x1, compute_len); | ||||||||||||||||||
__bang_cycle_add(x1, x1, scores_max_boxes_ptr, compute_len, compute_len); | ||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 这里是想要求 x1 和 scores_max_boxes[0] 的最大值的吗?为什么不使用 bang_max(); |
||||||||||||||||||
|
||||||||||||||||||
// min y1 | ||||||||||||||||||
__bang_write_value(scores_max_boxes_ptr, compute_len, scores_max_boxes[1]); | ||||||||||||||||||
__bang_write_zero(tem, compute_len); | ||||||||||||||||||
__bang_cycle_add(tem, tem, scores_max_boxes_ptr, compute_len, compute_len); | ||||||||||||||||||
Comment on lines
+172
to
+174
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 为什么不直接使用scores_max_boxes_ptr呢,从后面的代码看该部分的数据会被重写。 |
||||||||||||||||||
__bang_sub(tem, y1, scores_max_boxes_ptr, compute_len); | ||||||||||||||||||
__bang_active_relu(tem, tem, compute_len); | ||||||||||||||||||
__bang_sub(y1, y1, tem, compute_len); | ||||||||||||||||||
|
||||||||||||||||||
// min x2 | ||||||||||||||||||
__bang_write_value(scores_max_boxes_ptr, compute_len, scores_max_boxes[2]); | ||||||||||||||||||
__bang_write_zero(tem, compute_len); | ||||||||||||||||||
__bang_cycle_add(tem, tem, scores_max_boxes_ptr, compute_len, compute_len); | ||||||||||||||||||
__bang_sub(tem, x2, scores_max_boxes_ptr, compute_len); | ||||||||||||||||||
__bang_active_relu(tem, tem, compute_len); | ||||||||||||||||||
__bang_sub(x2, x2, tem, compute_len); | ||||||||||||||||||
|
||||||||||||||||||
// max y2 | ||||||||||||||||||
__bang_write_value(scores_max_boxes_ptr, compute_len, scores_max_boxes[3]); | ||||||||||||||||||
__bang_cycle_sub(y2, y2, scores_max_boxes_ptr, compute_len, compute_len); | ||||||||||||||||||
__bang_active_relu(y2, y2, compute_len); | ||||||||||||||||||
__bang_cycle_add(y2, y2, scores_max_boxes_ptr, compute_len, compute_len); | ||||||||||||||||||
|
||||||||||||||||||
// --------- intesection------- | ||||||||||||||||||
// fing W | ||||||||||||||||||
__bang_sub(w, x2, x1, compute_len); | ||||||||||||||||||
__bang_active_relu(w, w, compute_len); | ||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 为什么要求激活呢?如果本来就不想交,两者得到的是负数,这里取绝对值是否存在问题? |
||||||||||||||||||
|
||||||||||||||||||
// find H | ||||||||||||||||||
__bang_sub(h, y1, y2, compute_len); | ||||||||||||||||||
__bang_active_relu(h, h, compute_len); | ||||||||||||||||||
|
||||||||||||||||||
// fing intersection | ||||||||||||||||||
__bang_mul(inter_area_ptr, h, w, compute_len); | ||||||||||||||||||
|
||||||||||||||||||
// fing scores max boxes area | ||||||||||||||||||
w_s = scores_max_boxes[2] - scores_max_boxes[0]; | ||||||||||||||||||
h_s = scores_max_boxes[1] - scores_max_boxes[3]; | ||||||||||||||||||
scores_max_boxes_area = w_s * h_s; | ||||||||||||||||||
|
||||||||||||||||||
__bang_write_value(scores_max_boxes_area_ptr, compute_len, | ||||||||||||||||||
scores_max_boxes_area); | ||||||||||||||||||
__bang_cycle_add(tem, area_ptr, scores_max_boxes_area_ptr, | ||||||||||||||||||
compute_len, compute_len); | ||||||||||||||||||
Comment on lines
+212
to
+213
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
综合代码来看,tem这块内存并不需要,其他位置可以使用scores_max_boxes_area_ptr进行操作,这里可以将结果保存在area_ptr中。 |
||||||||||||||||||
__bang_sub(tem, tem, inter_area_ptr, compute_len); | ||||||||||||||||||
__bang_write_value(nms_thres_ptr, compute_len, nms_thres); | ||||||||||||||||||
__bang_cycle_mul(tem, tem, nms_thres_ptr, compute_len, compute_len); | ||||||||||||||||||
|
||||||||||||||||||
__bang_le(tem, inter_area_ptr, tem, compute_len); | ||||||||||||||||||
|
||||||||||||||||||
for (int i = 0; i < seg; i++) { | ||||||||||||||||||
if (tem[i]) { | ||||||||||||||||||
result[i] = 1; | ||||||||||||||||||
} else { | ||||||||||||||||||
result[i] = 0; | ||||||||||||||||||
} | ||||||||||||||||||
} | ||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 该for循环是否可以使用 bang_mul 处理? |
||||||||||||||||||
__memcpy(output_boxes_index + offset, result, seg * sizeof(uint8_t), | ||||||||||||||||||
NRAM2GDRAM); | ||||||||||||||||||
} | ||||||||||||||||||
|
||||||||||||||||||
UNION_OP_KERNEL_IMPLE(MlNms, float, Fast); | ||||||||||||||||||
UNION_OP_KERNEL_IMPLE(MlNms, half, Fast); | ||||||||||||||||||
|
||||||||||||||||||
void MLUOP_WIN_API mluOpKernelMlNmsFloatFast( | ||||||||||||||||||
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, | ||||||||||||||||||
mluOpDataType_t data_type, void* boxes_data_ptr, float nms_thres, | ||||||||||||||||||
int input_boxes_num, uint8_t* output_boxes_index) { | ||||||||||||||||||
MLUOpKernelMlNmsfloatFast<<<k_dim, k_type, queue>>>( | ||||||||||||||||||
data_type, boxes_data_ptr, nms_thres, | ||||||||||||||||||
input_boxes_num, output_boxes_index); | ||||||||||||||||||
} | ||||||||||||||||||
|
||||||||||||||||||
void MLUOP_WIN_API mluOpKernelMlNmsHalfFast( | ||||||||||||||||||
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, | ||||||||||||||||||
mluOpDataType_t data_type, void* boxes_data_ptr, float nms_thres, | ||||||||||||||||||
int input_boxes_num, uint8_t* output_boxes_index) { | ||||||||||||||||||
MLUOpKernelMlNmshalfFast<<<k_dim, k_type, queue>>>( | ||||||||||||||||||
data_type, boxes_data_ptr, nms_thres, | ||||||||||||||||||
input_boxes_num, output_boxes_index); | ||||||||||||||||||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.