From 78ba1b7a87af69b5060488379c9a0a19d4c0e276 Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+Jones154@users.noreply.github.com> Date: Fri, 12 Aug 2022 14:57:27 +0800 Subject: [PATCH 01/36] Create test.txt --- bangc-ops/kernels/ml_nms/test.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 bangc-ops/kernels/ml_nms/test.txt diff --git a/bangc-ops/kernels/ml_nms/test.txt b/bangc-ops/kernels/ml_nms/test.txt new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/bangc-ops/kernels/ml_nms/test.txt @@ -0,0 +1 @@ + From 44a4f4fad3583c21799e0e28b7f4de11abd99db6 Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+Jones154@users.noreply.github.com> Date: Fri, 12 Aug 2022 14:58:00 +0800 Subject: [PATCH 02/36] Add files via upload --- bangc-ops/kernels/ml_nms/ml_nms.cpp | 74 ++++++++ bangc-ops/kernels/ml_nms/ml_nms_block.mlu | 166 ++++++++++++++++++ bangc-ops/kernels/ml_nms/ml_nms_union.mlu | 201 ++++++++++++++++++++++ 3 files changed, 441 insertions(+) create mode 100644 bangc-ops/kernels/ml_nms/ml_nms.cpp create mode 100644 bangc-ops/kernels/ml_nms/ml_nms_block.mlu create mode 100644 bangc-ops/kernels/ml_nms/ml_nms_union.mlu diff --git a/bangc-ops/kernels/ml_nms/ml_nms.cpp b/bangc-ops/kernels/ml_nms/ml_nms.cpp new file mode 100644 index 000000000..f59972a6a --- /dev/null +++ b/bangc-ops/kernels/ml_nms/ml_nms.cpp @@ -0,0 +1,74 @@ +/************************************************************************* + > File Name: main.cpp + > Author: wenzhengyin + > Mail: jones980116@163.com + > Created Time: Tue Apr 19 14:35:06 2022 + ************************************************************************/ +#include "cnrt.h" +#include "cndev.h" +//#include "cnrt_data.h" +#include +#include +#include +#include +#include +#include +#include +#include "core/mlu_op_core.h" +#include "core/context.h" +#include "core/gen_case.h" +#include "core/logging.h" +#include "core/runtime/device.h" +#include "core/tensor.h" +#include "core/type.h" +#include "mlu_op_kernel.h" +#include "kernels/unary_op/unary_op_host.h" +using namespace std; +typedef uint16_t half; + + +mluOpStatus_t MLUOP_WIN_API mluOpMlNms(mluOpHandle_t handle, + const mluOpTensorDescriptor_t boxes_data_ptr_desc, void* boxes_data_ptr, void* scores_max_boxes_data_ptr, + int input_boxes_num, float iou_threshold, uint8_t* output_boxes_index) { + + int setoff; + bool zero_element = false; + mluOpDataType_t data_type = MLUOP_DTYPE_HALF; + mluOpDataType_t support_type[2] = {MLUOP_DTYPE_HALF, MLUOP_DTYPE_FLOAT}; + + cnrtDim3_t k_dim = {4, 1, 1}; + cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1; + + mluOpStatus_t param_check = unaryOpNmsParamCheck( + "[mluOpMlNms]", boxes_data_ptr_desc, boxes_data_ptr, scores_max_boxes_data_ptr, support_type, 2, zero_element); + if(param_check != MLUOP_STATUS_SUCCESS){ + return param_check; + } + + void (*mluOpFuncKernel)(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, mluOpDataType_t data_type, + void* boxes_data_ptr, void* scores_max_boxes_data_ptr, float nmsThres, int input_boxes_num, uint8_t* output_boxes_index); + //mluOpFuncKernel = NULL; + if (k_type == CNRT_FUNC_TYPE_BLOCK){ + if (boxes_data_ptr_desc->dtype == MLUOP_DTYPE_HALF){ + mluOpFuncKernel = mluBlockKernelMlNmsHalfFast; + }else { + mluOpFuncKernel = mluBlockKernelMlNmsFloatFast; + } + }else { + if (boxes_data_ptr_desc->dtype == MLUOP_DTYPE_HALF){ + mluOpFuncKernel = mluUnionKernelMlNmsHalfFast; + }else{ + mluOpFuncKernel = mluUnionKernelMlNmsFloatFast; + } + } + + KERNEL_CHECK( + (mluOpFuncKernel(k_dim, k_type, 
handle->queue, boxes_data_ptr_desc->dtype, boxes_data_ptr, scores_max_boxes_data_ptr, iou_threshold, input_boxes_num, output_boxes_index))); + GEN_CASE_END(); + + return MLUOP_STATUS_SUCCESS; + +} + + + diff --git a/bangc-ops/kernels/ml_nms/ml_nms_block.mlu b/bangc-ops/kernels/ml_nms/ml_nms_block.mlu new file mode 100644 index 000000000..be80bd2fc --- /dev/null +++ b/bangc-ops/kernels/ml_nms/ml_nms_block.mlu @@ -0,0 +1,166 @@ +#include "bang.h" +#include "mlu_op_kernel.h" +#include "kernels/unary_op/unary_op_block.h" + +#define MLU_VERSION 270 +#if MLU_VERSION == 220 +#undef MLU_VERSION +#define MLU_VERSION 220 +#elif MLU_VERSION == 270 +#undef MLU_VERSION +#define MLU_VERSION 270 +#elif MLU_VERSION == 290 +#undef MLU_VERSION +#define MLU_VERSION 290 +#elif MLU_VERSION == 322 +#undef MLU_VERSION +#define MLU_VERSION 322 +#elif MLU_VERSION == 372 +#undef MLU_VERSION +#define MLU_VERSION 372 +#endif + +__mlu_func__ void setSetoff(mluOpDataType_t data_type, int input_boxes_num, int min_cell, int* setoff){ + + switch (data_type){ + case MLUOP_DTYPE_HALF: + if((input_boxes_num % (min_cell / 2)) != 0){ + *setoff = (min_cell / 2) * (input_boxes_num / (min_cell / 2) + 1); + }else{ + *setoff = (min_cell / 2) * (input_boxes_num / (min_cell / 2)); + } + case MLUOP_DTYPE_FLOAT: + if ((input_boxes_num % (min_cell / 4)) != 0){ + *setoff = (min_cell / 4) * (input_boxes_num / (min_cell / 4) + 1); + } else { + *setoff = (min_cell / 4) * (input_boxes_num / (min_cell / 4)); + } + default:break; + } +} + +__mlu_func__ void getOffsetNumMlNmsFast(int input_boxes_num, mluOpDataType_t data_type, int* setoff){ + + if (MLU_VERSION < 200){ + setSetoff(data_type, input_boxes_num, 64, setoff); + } else if (MLU_VERSION > 200 && MLU_VERSION < 300){ + setSetoff(data_type, input_boxes_num, 128, setoff); + } else if (MLU_VERSION > 300 && MLU_VERSION < 400){ + setSetoff(data_type, input_boxes_num, 128, setoff); + } +} + + + +template +__mlu_func__ void computeMlNmsFast(T* boxes_data_ptr, T* scores_max_boxes_data_ptr, T nms_thres, int setoff, int input_boxes_num, uint8_t* output_boxes_index){ + + __nram__ T scores_max_boxes[4]; + __nram__ T scores_max_boxes_area; + __nram__ T boxes_data[512]; + __nram__ T x1[512]; + __nram__ T y1[512]; + __nram__ T x2[512]; + __nram__ T y2[512]; + __nram__ T w[512]; + __nram__ T h[512]; + __nram__ T area_ptr[512]; + __nram__ T interarea_ptr[512]; + __nram__ T scores_max_boxes_area_ptr[512]; + __nram__ T nms_thresPtr[512]; + __nram__ T scores_max_boxes_ptr[512]; + __nram__ T tem[512]; + __nram__ uint8_t result[512]; + + __memcpy(boxes_data, boxes_data_ptr, input_boxes_num * 4 * sizeof(T), GDRAM2NRAM); + __memcpy(scores_max_boxes, scores_max_boxes_data_ptr, 4 * sizeof(T), GDRAM2NRAM); + + int j,i; + for(i = 0, j = 0; i < setoff; i++, j+=4){ + x1[i] = boxes_data[j + 0]; + y1[i] = boxes_data[j + 1]; + x2[i] = boxes_data[j + 2]; + y2[i] = boxes_data[j + 3]; + } + + + //-----------------iou detect-------------------- + + //fing all boxes area + __bang_sub(h, y1, y2, setoff); + __bang_sub(w, x2, x1, setoff); + __bang_mul(area_ptr, h, w, setoff); + + //max x1 + __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[0]); + __bang_cycle_sub(x1, x1, scores_max_boxes_ptr, setoff, setoff); + __bang_active_relu(x1, x1, setoff); + __bang_cycle_add(x1, x1, scores_max_boxes_ptr, setoff, setoff); + + //min y1 + __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[1]); + __bang_write_zero(tem, setoff); + __bang_cycle_add(tem, tem, scores_max_boxes_ptr, setoff, setoff); + __bang_sub(tem, 
y1, scores_max_boxes_ptr, setoff); + __bang_active_relu(tem, tem, setoff); + __bang_sub(y1, y1, tem, setoff); + + //min x2 + __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[2]); + __bang_write_zero(tem, setoff); + __bang_cycle_add(tem, tem, scores_max_boxes_ptr, setoff, setoff); + __bang_sub(tem, x2, scores_max_boxes_ptr, setoff); + __bang_active_relu(tem, tem, setoff); + __bang_sub(x2, x2, tem, setoff); + + //max y2 + __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[3]); + __bang_cycle_sub(y2, y2, scores_max_boxes_ptr, setoff, setoff); + __bang_active_relu(y2, y2, setoff); + __bang_cycle_add(y2, y2, scores_max_boxes_ptr, setoff, setoff); + + //--------- intesection------- + //fing W + __bang_sub(w, x2, x1, setoff); + __bang_active_relu(w, w, setoff); + + //find H + __bang_sub(h, y1, y2, setoff); + __bang_active_relu(h, h, setoff); + + //fing intersection + __bang_mul(interarea_ptr, h, w, setoff); + + //fing scores max boxes area + scores_max_boxes_area=(scores_max_boxes[1] - scores_max_boxes[3]) * (scores_max_boxes[2] - scores_max_boxes[0]); + __bang_write_value(scores_max_boxes_area_ptr, setoff, scores_max_boxes_area); + __bang_cycle_add(tem, area_ptr, scores_max_boxes_area_ptr, setoff, setoff); + __bang_sub(tem, tem, interarea_ptr, setoff); + __bang_write_value(nms_thresPtr, setoff, nms_thres); + __bang_cycle_mul(tem, tem, nms_thresPtr, setoff, setoff); + __bang_ge(tem, interarea_ptr, tem, setoff); + + for(int i =0; i < setoff; i++){ + result[i] = (int)(tem[i]); + } + + __memcpy(output_boxes_index, result, input_boxes_num * sizeof(uint8_t), NRAM2GDRAM); +} + + +UNION_OP_KERNEL_IMPLE(MlNms, float, Fast); +UNION_OP_KERNEL_IMPLE(MlNms, half, Fast); + +void MLUOP_WIN_API mluBlockKernelMlNmsFloatFast( + cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + mluOpDataType_t data_type, void* boxes_data_ptr, void* scores_max_boxes_data_ptr, float nms_thres, int input_boxes_num, uint8_t* output_boxes_index){ + MLUBlockKernelMlNmsfloatFast<<>>( + data_type, boxes_data_ptr, scores_max_boxes_data_ptr, nms_thres, input_boxes_num, output_boxes_index); +} + +void MLUOP_WIN_API mluBlockKernelMlNmsHalfFast( + cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + mluOpDataType_t data_type, void* boxes_data_ptr, void* scores_max_boxes_data_ptr, float nms_thres, int input_boxes_num, uint8_t* output_boxes_index){ + MLUBlockKernelMlNmshalfFast<<>>( + data_type, boxes_data_ptr, scores_max_boxes_data_ptr, nms_thres, input_boxes_num, output_boxes_index); +} diff --git a/bangc-ops/kernels/ml_nms/ml_nms_union.mlu b/bangc-ops/kernels/ml_nms/ml_nms_union.mlu new file mode 100644 index 000000000..cc878d2e3 --- /dev/null +++ b/bangc-ops/kernels/ml_nms/ml_nms_union.mlu @@ -0,0 +1,201 @@ +#include "bang.h" +#include "mlu_op_kernel.h" +#include "kernels/unary_op/unary_op_union.h" + +#define MLU_VERSION 270 +#if MLU_VERSION == 220 +#undef MLU_VERSION +#define MLU_VERSION 220 +#elif MLU_VERSION == 270 +#undef MLU_VERSION +#define MLU_VERSION 270 +#elif MLU_VERSION == 290 +#undef MLU_VERSION +#define MLU_VERSION 290 +#elif MLU_VERSION == 322 +#undef MLU_VERSION +#define MLU_VERSION 322 +#elif MLU_VERSION == 372 +#undef MLU_VERSION +#define MLU_VERSION 372 +#endif + +__mlu_func__ void setSetoff(mluOpDataType_t data_type, int input_boxes_num, int min_cell, int cord_num, int* setoff){ + switch (data_type){ + case MLUOP_DTYPE_HALF: + if((input_boxes_num % (min_cell / 2 * cord_num)) != 0){ + *setoff = (min_cell / 2) * (input_boxes_num / (min_cell / 2 * cord_num) + 1); + 
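+        // Not evenly divisible: round this core's share up to the next whole vector
+        // cell (min_cell bytes = min_cell / 2 half elements) so the __bang_* vector
+        // ops below run on an aligned length; cord_num is presumably the core count
+        // of the launched union task.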
}else{ + *setoff = (min_cell / 2) * (input_boxes_num / (min_cell / 2 * cord_num)); + } + case MLUOP_DTYPE_FLOAT: + if ((input_boxes_num % (min_cell / 4 * cord_num)) != 0){ + *setoff = (min_cell / 4) * (input_boxes_num / (min_cell / 4 * cord_num) + 1); + } else { + *setoff = (min_cell / 4) * (input_boxes_num / (min_cell / 4 * cord_num)); + } + default:break; + } +} + +__mlu_func__ void getOffsetNumMlNmsFast(int input_boxes_num, mluOpDataType_t data_type, int* setoff, cnrtFunctionType_t k_type){ + if(k_type == CNRT_FUNC_TYPE_UNION1){ + if (MLU_VERSION < 200){ + setSetoff(data_type, input_boxes_num, 64, 4, setoff); + } else if (MLU_VERSION > 200 && MLU_VERSION < 300){ + setSetoff(data_type, input_boxes_num, 128, 4, setoff); + } else if (MLU_VERSION > 300 && MLU_VERSION < 400){ + setSetoff(data_type, input_boxes_num, 128, 4, setoff); + } + } else if(k_type == CNRT_FUNC_TYPE_UNION2){ + if (MLU_VERSION < 200){ + setSetoff(data_type, input_boxes_num, 64, 8, setoff); + } else if (MLU_VERSION > 250 && MLU_VERSION < 300){ + setSetoff(data_type, input_boxes_num, 128, 8, setoff); + } else if (MLU_VERSION > 300 && MLU_VERSION < 400){ + setSetoff(data_type, input_boxes_num, 128, 8, setoff); + } + } else if(k_type == CNRT_FUNC_TYPE_UNION4){ + if (MLU_VERSION < 200){ + setSetoff(data_type, input_boxes_num, 64, 16, setoff); + } else if (MLU_VERSION > 250 && MLU_VERSION < 300){ + setSetoff(data_type, input_boxes_num, 128, 16, setoff); + } else if (MLU_VERSION > 300 && MLU_VERSION < 400){ + setSetoff(data_type, input_boxes_num, 128, 16, setoff); + } + } else if(k_type == CNRT_FUNC_TYPE_UNION8){ + if (MLU_VERSION < 200){ + setSetoff(data_type, input_boxes_num, 64, 32, setoff); + } + } +} + + + +template +__mlu_func__ void computeMlNmsFast(T* boxes_data_ptr, T* scores_max_boxes_data_ptr, T nms_thres, int setoff, int input_boxes_num, uint8_t* output_boxes_index){ + + __nram__ T scores_max_boxes[4]; + __nram__ T scores_max_boxes_area; + __nram__ T boxes_data[512]; + __nram__ T x1[512]; + __nram__ T y1[512]; + __nram__ T x2[512]; + __nram__ T y2[512]; + __nram__ T w[512]; + __nram__ T h[512]; + __nram__ T area_ptr[512]; + __nram__ T interarea_ptr[512]; + __nram__ T scores_max_boxes_area_ptr[512]; + __nram__ T nms_thres_ptr[512]; + __nram__ T scores_max_boxes_ptr[512]; + __nram__ T tem[512]; + __nram__ uint8_t result[512]; + + if (input_boxes_num % taskDim != 0){ + __memcpy(boxes_data, boxes_data_ptr + (taskId * (input_boxes_num / taskDim) * 4), (input_boxes_num / taskDim) * 4 * sizeof(T), GDRAM2NRAM); + } else { + if (taskId == (taskDim - 1)){ + __memcpy(boxes_data, boxes_data_ptr + (taskId * (input_boxes_num / taskDim) * 4), ((input_boxes_num / taskDim) + (input_boxes_num % taskDim)) * 4 * sizeof(T), GDRAM2NRAM); + } else { + __memcpy(boxes_data, boxes_data_ptr + (taskId * (input_boxes_num / taskDim) * 4), (input_boxes_num / taskDim) * 4 * sizeof(T), GDRAM2NRAM); + } + } + __memcpy(scores_max_boxes, scores_max_boxes_data_ptr, 4 * sizeof(T), GDRAM2NRAM); + + int j,i; + for(i = 0, j = 0; i < setoff; i++, j+=4){ + x1[i] = boxes_data[j + 0]; + y1[i] = boxes_data[j + 1]; + x2[i] = boxes_data[j + 2]; + y2[i] = boxes_data[j + 3]; + } + + //-----------------iou detect-------------------- + //fing all boxes area + __bang_sub(h, y1, y2, setoff); + __bang_sub(w, x2, x1, setoff); + __bang_mul(area_ptr, h, w, setoff); + + //max x1 + __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[0]); + __bang_cycle_sub(x1, x1, scores_max_boxes_ptr, setoff, setoff); + __bang_active_relu(x1, x1, setoff); + 
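+        // relu(x1 - ref_x1) + ref_x1 == max(x1, ref_x1): the cycle_add below
+        // completes the elementwise max against the top-scoring box's x1, i.e.
+        // the left edge of each intersection rectangle.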
__bang_cycle_add(x1, x1, scores_max_boxes_ptr, setoff, setoff); + + //min y1 + __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[1]); + __bang_write_zero(tem, setoff); + __bang_cycle_add(tem, tem, scores_max_boxes_ptr, setoff, setoff); + __bang_sub(tem, y1, scores_max_boxes_ptr, setoff); + __bang_active_relu(tem, tem, setoff); + __bang_sub(y1, y1, tem, setoff); + + //min x2 + __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[2]); + __bang_write_zero(tem, setoff); + __bang_cycle_add(tem, tem, scores_max_boxes_ptr, setoff, setoff); + __bang_sub(tem, x2, scores_max_boxes_ptr, setoff); + __bang_active_relu(tem, tem, setoff); + __bang_sub(x2, x2, tem, setoff); + + //max y2 + __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[3]); + __bang_cycle_sub(y2, y2, scores_max_boxes_ptr, setoff, setoff); + __bang_active_relu(y2, y2, setoff); + __bang_cycle_add(y2, y2, scores_max_boxes_ptr, setoff, setoff); + + //--------- intesection------- + //fing W + __bang_sub(w, x2, x1, setoff); + __bang_active_relu(w, w, setoff); + + //find H + __bang_sub(h, y1, y2, setoff); + __bang_active_relu(h, h, setoff); + + //fing intersection + __bang_mul(interarea_ptr, h, w, setoff); + + //fing scores max boxes area + scores_max_boxes_area=(scores_max_boxes[1] - scores_max_boxes[3]) * (scores_max_boxes[2] - scores_max_boxes[0]); + __bang_write_value(scores_max_boxes_area_ptr, setoff, scores_max_boxes_area); + __bang_cycle_add(tem, area_ptr, scores_max_boxes_area_ptr, setoff, setoff); + __bang_sub(tem, tem, interarea_ptr, setoff); + __bang_write_value(nms_thres_ptr, setoff, nms_thres); + __bang_cycle_mul(tem, tem, nms_thres_ptr, setoff, setoff); + __bang_gt(tem, interarea_ptr, tem, setoff); + + for(int i =0; i < setoff; i++){ + result[i] = (int)(tem[i]); + } + + if (input_boxes_num % taskDim !=0){ + if (taskId == (taskDim - 1)){ + __memcpy(output_boxes_index + (taskId * (input_boxes_num / taskDim)), result, (input_boxes_num / taskDim + input_boxes_num % taskDim) * sizeof(uint8_t), NRAM2GDRAM); + } else { + __memcpy(output_boxes_index + (taskId * (input_boxes_num / taskDim)), result, (input_boxes_num / taskDim) * sizeof(uint8_t), NRAM2GDRAM); + } + } else { + __memcpy(output_boxes_index + (taskId * (input_boxes_num / taskDim)), result, (input_boxes_num / taskDim) * sizeof(uint8_t), NRAM2GDRAM); + } + +} + + +UNION_OP_KERNEL_IMPLE(MlNms, float, Fast); +UNION_OP_KERNEL_IMPLE(MlNms, half, Fast); + +void MLUOP_WIN_API mluUnionKernelMlNmsFloatFast( + cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + mluOpDataType_t data_type, void* boxes_data_ptr, void* scores_max_boxes_data_ptr, float nms_thres, int input_boxes_num, uint8_t* output_boxes_index){ + MLUUnionKernelMlNmsfloatFast<<>>( + k_type, data_type, boxes_data_ptr, scores_max_boxes_data_ptr, nms_thres, input_boxes_num, output_boxes_index); +} + +void MLUOP_WIN_API mluUnionKernelMlNmsHalfFast( + cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + mluOpDataType_t data_type, void* boxes_data_ptr, void* scores_max_boxes_data_ptr, float nms_thres, int input_boxes_num, uint8_t* output_boxes_index){ + MLUUnionKernelMlNmshalfFast<<>>( + k_type, data_type, boxes_data_ptr, scores_max_boxes_data_ptr, nms_thres, input_boxes_num, output_boxes_index); +} From 11bf820389ff110523f320fccce84e41b838039b Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+Jones154@users.noreply.github.com> Date: Fri, 12 Aug 2022 14:58:58 +0800 Subject: [PATCH 03/36] Add files via upload --- bangc-ops/kernels/unary_op/unary_op_block.h | 
28 +++++++++++++++++++++ bangc-ops/kernels/unary_op/unary_op_union.h | 28 +++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 bangc-ops/kernels/unary_op/unary_op_block.h create mode 100644 bangc-ops/kernels/unary_op/unary_op_union.h diff --git a/bangc-ops/kernels/unary_op/unary_op_block.h b/bangc-ops/kernels/unary_op/unary_op_block.h new file mode 100644 index 000000000..9314775eb --- /dev/null +++ b/bangc-ops/kernels/unary_op/unary_op_block.h @@ -0,0 +1,28 @@ +/************************************************************************* + > File Name: union_task.h + > Author: wenzhengyin + > Mail: jones980116@163.com + > Created Time: Sun Jun 19 18:58:04 2022 + ************************************************************************/ +#ifndef UNARY_OP_BLOCK_H_ +#define UNARY_OP_BLOCK_H_ + +#define UNION_OP_KERNEL_DECLARE(Op, DType, Prefer) \ + __mlu_global__ void MLUBlockKernel##Op##DType##Prefer(\ + mluOpDataType_t data_type, void* boxes_data_ptr, void* boxes_scores_ptr, float nms_thres, int input_boxes_num, uint8_t* output_boxes_index);\ + +#define UNION_OP_KERNEL_IMPLE(Op, DType, Prefer) \ + __mlu_global__ void MLUBlockKernel##Op##DType##Prefer( \ + mluOpDataType_t data_type, void* boxes_data_ptr, void* boxes_scores_ptr, float nms_thres, int input_boxes_num, uint8_t* output_boxes_index){\ + int setoff;\ + getOffsetNum##Op##Prefer(input_boxes_num, data_type, &setoff); \ + unionImple(\ + (DType*)boxes_data_ptr, (DType*)boxes_scores_ptr, (DType)nms_thres, setoff, input_boxes_num, output_boxes_index);} + +template +__mlu_device__ void unionImple(T* boxes_data_ptr, T* boxes_scores_ptr, T nms_thres, int setoff, int input_boxes_num, uint8_t* output_boxes_index){ + + OpFunc(boxes_data_ptr, boxes_scores_ptr, nms_thres, setoff, input_boxes_num, output_boxes_index); +} + +#endif diff --git a/bangc-ops/kernels/unary_op/unary_op_union.h b/bangc-ops/kernels/unary_op/unary_op_union.h new file mode 100644 index 000000000..302694520 --- /dev/null +++ b/bangc-ops/kernels/unary_op/unary_op_union.h @@ -0,0 +1,28 @@ +/************************************************************************* + > File Name: union_task.h + > Author: wenzhengyin + > Mail: jones980116@163.com + > Created Time: Sun Jun 19 18:58:04 2022 + ************************************************************************/ +#ifndef UNARY_OP_UNION_H_ +#define UNARY_OP_UNION_H_ + +#define UNION_OP_KERNEL_DECLARE(Op, DType, Prefer) \ + __mlu_global__ void MLUUnionKernel##Op##DType##Prefer(\ + cnrtFunctionType_t k_type, mluOpDataType_t data_type, void* boxes_data_ptr, void* boxes_scores_ptr, float nms_thres, int input_boxes_num, uint8_t* output_boxes_index);\ + +#define UNION_OP_KERNEL_IMPLE(Op, DType, Prefer) \ + __mlu_global__ void MLUUnionKernel##Op##DType##Prefer( \ + cnrtFunctionType_t k_type, mluOpDataType_t data_type, void* boxes_data_ptr, void* boxes_scores_ptr, float nms_thres, int input_boxes_num, uint8_t* output_boxes_index){\ + int setoff;\ + getOffsetNum##Op##Prefer(input_boxes_num, data_type, &setoff, k_type); \ + unionImple(\ + (DType*)boxes_data_ptr, (DType*)boxes_scores_ptr, (DType)nms_thres, setoff, input_boxes_num, output_boxes_index);} + +template +__mlu_device__ void unionImple(T* boxes_data_ptr, T* boxes_scores_ptr, T nms_thres, int setoff, int input_boxes_num, uint8_t* output_boxes_index){ + + OpFunc(boxes_data_ptr, boxes_scores_ptr, nms_thres, setoff, input_boxes_num, output_boxes_index); +} + +#endif From 58995867784311c4b0dcdcf13165ce2566531f03 Mon Sep 17 00:00:00 2001 From: wenzhengyin 
<71548662+Jones154@users.noreply.github.com> Date: Fri, 12 Aug 2022 16:36:54 +0800 Subject: [PATCH 04/36] Delete unary_op_block.h --- bangc-ops/kernels/unary_op/unary_op_block.h | 28 --------------------- 1 file changed, 28 deletions(-) delete mode 100644 bangc-ops/kernels/unary_op/unary_op_block.h diff --git a/bangc-ops/kernels/unary_op/unary_op_block.h b/bangc-ops/kernels/unary_op/unary_op_block.h deleted file mode 100644 index 9314775eb..000000000 --- a/bangc-ops/kernels/unary_op/unary_op_block.h +++ /dev/null @@ -1,28 +0,0 @@ -/************************************************************************* - > File Name: union_task.h - > Author: wenzhengyin - > Mail: jones980116@163.com - > Created Time: Sun Jun 19 18:58:04 2022 - ************************************************************************/ -#ifndef UNARY_OP_BLOCK_H_ -#define UNARY_OP_BLOCK_H_ - -#define UNION_OP_KERNEL_DECLARE(Op, DType, Prefer) \ - __mlu_global__ void MLUBlockKernel##Op##DType##Prefer(\ - mluOpDataType_t data_type, void* boxes_data_ptr, void* boxes_scores_ptr, float nms_thres, int input_boxes_num, uint8_t* output_boxes_index);\ - -#define UNION_OP_KERNEL_IMPLE(Op, DType, Prefer) \ - __mlu_global__ void MLUBlockKernel##Op##DType##Prefer( \ - mluOpDataType_t data_type, void* boxes_data_ptr, void* boxes_scores_ptr, float nms_thres, int input_boxes_num, uint8_t* output_boxes_index){\ - int setoff;\ - getOffsetNum##Op##Prefer(input_boxes_num, data_type, &setoff); \ - unionImple(\ - (DType*)boxes_data_ptr, (DType*)boxes_scores_ptr, (DType)nms_thres, setoff, input_boxes_num, output_boxes_index);} - -template -__mlu_device__ void unionImple(T* boxes_data_ptr, T* boxes_scores_ptr, T nms_thres, int setoff, int input_boxes_num, uint8_t* output_boxes_index){ - - OpFunc(boxes_data_ptr, boxes_scores_ptr, nms_thres, setoff, input_boxes_num, output_boxes_index); -} - -#endif From 7438d4f8d3fa6ec5cbf2af2385c5527e2c17b5d3 Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+Jones154@users.noreply.github.com> Date: Fri, 12 Aug 2022 16:37:13 +0800 Subject: [PATCH 05/36] Delete unary_op_union.h --- bangc-ops/kernels/unary_op/unary_op_union.h | 28 --------------------- 1 file changed, 28 deletions(-) delete mode 100644 bangc-ops/kernels/unary_op/unary_op_union.h diff --git a/bangc-ops/kernels/unary_op/unary_op_union.h b/bangc-ops/kernels/unary_op/unary_op_union.h deleted file mode 100644 index 302694520..000000000 --- a/bangc-ops/kernels/unary_op/unary_op_union.h +++ /dev/null @@ -1,28 +0,0 @@ -/************************************************************************* - > File Name: union_task.h - > Author: wenzhengyin - > Mail: jones980116@163.com - > Created Time: Sun Jun 19 18:58:04 2022 - ************************************************************************/ -#ifndef UNARY_OP_UNION_H_ -#define UNARY_OP_UNION_H_ - -#define UNION_OP_KERNEL_DECLARE(Op, DType, Prefer) \ - __mlu_global__ void MLUUnionKernel##Op##DType##Prefer(\ - cnrtFunctionType_t k_type, mluOpDataType_t data_type, void* boxes_data_ptr, void* boxes_scores_ptr, float nms_thres, int input_boxes_num, uint8_t* output_boxes_index);\ - -#define UNION_OP_KERNEL_IMPLE(Op, DType, Prefer) \ - __mlu_global__ void MLUUnionKernel##Op##DType##Prefer( \ - cnrtFunctionType_t k_type, mluOpDataType_t data_type, void* boxes_data_ptr, void* boxes_scores_ptr, float nms_thres, int input_boxes_num, uint8_t* output_boxes_index){\ - int setoff;\ - getOffsetNum##Op##Prefer(input_boxes_num, data_type, &setoff, k_type); \ - unionImple(\ - (DType*)boxes_data_ptr, (DType*)boxes_scores_ptr, 
(DType)nms_thres, setoff, input_boxes_num, output_boxes_index);} - -template -__mlu_device__ void unionImple(T* boxes_data_ptr, T* boxes_scores_ptr, T nms_thres, int setoff, int input_boxes_num, uint8_t* output_boxes_index){ - - OpFunc(boxes_data_ptr, boxes_scores_ptr, nms_thres, setoff, input_boxes_num, output_boxes_index); -} - -#endif From a2ddcea276723af62dbc97c57eca25be3920027e Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+Jones154@users.noreply.github.com> Date: Fri, 12 Aug 2022 16:37:33 +0800 Subject: [PATCH 06/36] Delete ml_nms.cpp --- bangc-ops/kernels/ml_nms/ml_nms.cpp | 74 ----------------------------- 1 file changed, 74 deletions(-) delete mode 100644 bangc-ops/kernels/ml_nms/ml_nms.cpp diff --git a/bangc-ops/kernels/ml_nms/ml_nms.cpp b/bangc-ops/kernels/ml_nms/ml_nms.cpp deleted file mode 100644 index f59972a6a..000000000 --- a/bangc-ops/kernels/ml_nms/ml_nms.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/************************************************************************* - > File Name: main.cpp - > Author: wenzhengyin - > Mail: jones980116@163.com - > Created Time: Tue Apr 19 14:35:06 2022 - ************************************************************************/ -#include "cnrt.h" -#include "cndev.h" -//#include "cnrt_data.h" -#include -#include -#include -#include -#include -#include -#include -#include "core/mlu_op_core.h" -#include "core/context.h" -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/runtime/device.h" -#include "core/tensor.h" -#include "core/type.h" -#include "mlu_op_kernel.h" -#include "kernels/unary_op/unary_op_host.h" -using namespace std; -typedef uint16_t half; - - -mluOpStatus_t MLUOP_WIN_API mluOpMlNms(mluOpHandle_t handle, - const mluOpTensorDescriptor_t boxes_data_ptr_desc, void* boxes_data_ptr, void* scores_max_boxes_data_ptr, - int input_boxes_num, float iou_threshold, uint8_t* output_boxes_index) { - - int setoff; - bool zero_element = false; - mluOpDataType_t data_type = MLUOP_DTYPE_HALF; - mluOpDataType_t support_type[2] = {MLUOP_DTYPE_HALF, MLUOP_DTYPE_FLOAT}; - - cnrtDim3_t k_dim = {4, 1, 1}; - cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1; - - mluOpStatus_t param_check = unaryOpNmsParamCheck( - "[mluOpMlNms]", boxes_data_ptr_desc, boxes_data_ptr, scores_max_boxes_data_ptr, support_type, 2, zero_element); - if(param_check != MLUOP_STATUS_SUCCESS){ - return param_check; - } - - void (*mluOpFuncKernel)(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, mluOpDataType_t data_type, - void* boxes_data_ptr, void* scores_max_boxes_data_ptr, float nmsThres, int input_boxes_num, uint8_t* output_boxes_index); - //mluOpFuncKernel = NULL; - if (k_type == CNRT_FUNC_TYPE_BLOCK){ - if (boxes_data_ptr_desc->dtype == MLUOP_DTYPE_HALF){ - mluOpFuncKernel = mluBlockKernelMlNmsHalfFast; - }else { - mluOpFuncKernel = mluBlockKernelMlNmsFloatFast; - } - }else { - if (boxes_data_ptr_desc->dtype == MLUOP_DTYPE_HALF){ - mluOpFuncKernel = mluUnionKernelMlNmsHalfFast; - }else{ - mluOpFuncKernel = mluUnionKernelMlNmsFloatFast; - } - } - - KERNEL_CHECK( - (mluOpFuncKernel(k_dim, k_type, handle->queue, boxes_data_ptr_desc->dtype, boxes_data_ptr, scores_max_boxes_data_ptr, iou_threshold, input_boxes_num, output_boxes_index))); - GEN_CASE_END(); - - return MLUOP_STATUS_SUCCESS; - -} - - - From ee16a1c51078a6fb0a96de36728a1bd4edf84787 Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+Jones154@users.noreply.github.com> Date: Fri, 12 Aug 2022 16:37:42 +0800 Subject: [PATCH 07/36] Delete ml_nms_block.mlu --- 
bangc-ops/kernels/ml_nms/ml_nms_block.mlu | 166 ---------------------- 1 file changed, 166 deletions(-) delete mode 100644 bangc-ops/kernels/ml_nms/ml_nms_block.mlu diff --git a/bangc-ops/kernels/ml_nms/ml_nms_block.mlu b/bangc-ops/kernels/ml_nms/ml_nms_block.mlu deleted file mode 100644 index be80bd2fc..000000000 --- a/bangc-ops/kernels/ml_nms/ml_nms_block.mlu +++ /dev/null @@ -1,166 +0,0 @@ -#include "bang.h" -#include "mlu_op_kernel.h" -#include "kernels/unary_op/unary_op_block.h" - -#define MLU_VERSION 270 -#if MLU_VERSION == 220 -#undef MLU_VERSION -#define MLU_VERSION 220 -#elif MLU_VERSION == 270 -#undef MLU_VERSION -#define MLU_VERSION 270 -#elif MLU_VERSION == 290 -#undef MLU_VERSION -#define MLU_VERSION 290 -#elif MLU_VERSION == 322 -#undef MLU_VERSION -#define MLU_VERSION 322 -#elif MLU_VERSION == 372 -#undef MLU_VERSION -#define MLU_VERSION 372 -#endif - -__mlu_func__ void setSetoff(mluOpDataType_t data_type, int input_boxes_num, int min_cell, int* setoff){ - - switch (data_type){ - case MLUOP_DTYPE_HALF: - if((input_boxes_num % (min_cell / 2)) != 0){ - *setoff = (min_cell / 2) * (input_boxes_num / (min_cell / 2) + 1); - }else{ - *setoff = (min_cell / 2) * (input_boxes_num / (min_cell / 2)); - } - case MLUOP_DTYPE_FLOAT: - if ((input_boxes_num % (min_cell / 4)) != 0){ - *setoff = (min_cell / 4) * (input_boxes_num / (min_cell / 4) + 1); - } else { - *setoff = (min_cell / 4) * (input_boxes_num / (min_cell / 4)); - } - default:break; - } -} - -__mlu_func__ void getOffsetNumMlNmsFast(int input_boxes_num, mluOpDataType_t data_type, int* setoff){ - - if (MLU_VERSION < 200){ - setSetoff(data_type, input_boxes_num, 64, setoff); - } else if (MLU_VERSION > 200 && MLU_VERSION < 300){ - setSetoff(data_type, input_boxes_num, 128, setoff); - } else if (MLU_VERSION > 300 && MLU_VERSION < 400){ - setSetoff(data_type, input_boxes_num, 128, setoff); - } -} - - - -template -__mlu_func__ void computeMlNmsFast(T* boxes_data_ptr, T* scores_max_boxes_data_ptr, T nms_thres, int setoff, int input_boxes_num, uint8_t* output_boxes_index){ - - __nram__ T scores_max_boxes[4]; - __nram__ T scores_max_boxes_area; - __nram__ T boxes_data[512]; - __nram__ T x1[512]; - __nram__ T y1[512]; - __nram__ T x2[512]; - __nram__ T y2[512]; - __nram__ T w[512]; - __nram__ T h[512]; - __nram__ T area_ptr[512]; - __nram__ T interarea_ptr[512]; - __nram__ T scores_max_boxes_area_ptr[512]; - __nram__ T nms_thresPtr[512]; - __nram__ T scores_max_boxes_ptr[512]; - __nram__ T tem[512]; - __nram__ uint8_t result[512]; - - __memcpy(boxes_data, boxes_data_ptr, input_boxes_num * 4 * sizeof(T), GDRAM2NRAM); - __memcpy(scores_max_boxes, scores_max_boxes_data_ptr, 4 * sizeof(T), GDRAM2NRAM); - - int j,i; - for(i = 0, j = 0; i < setoff; i++, j+=4){ - x1[i] = boxes_data[j + 0]; - y1[i] = boxes_data[j + 1]; - x2[i] = boxes_data[j + 2]; - y2[i] = boxes_data[j + 3]; - } - - - //-----------------iou detect-------------------- - - //fing all boxes area - __bang_sub(h, y1, y2, setoff); - __bang_sub(w, x2, x1, setoff); - __bang_mul(area_ptr, h, w, setoff); - - //max x1 - __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[0]); - __bang_cycle_sub(x1, x1, scores_max_boxes_ptr, setoff, setoff); - __bang_active_relu(x1, x1, setoff); - __bang_cycle_add(x1, x1, scores_max_boxes_ptr, setoff, setoff); - - //min y1 - __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[1]); - __bang_write_zero(tem, setoff); - __bang_cycle_add(tem, tem, scores_max_boxes_ptr, setoff, setoff); - __bang_sub(tem, y1, scores_max_boxes_ptr, 
setoff); - __bang_active_relu(tem, tem, setoff); - __bang_sub(y1, y1, tem, setoff); - - //min x2 - __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[2]); - __bang_write_zero(tem, setoff); - __bang_cycle_add(tem, tem, scores_max_boxes_ptr, setoff, setoff); - __bang_sub(tem, x2, scores_max_boxes_ptr, setoff); - __bang_active_relu(tem, tem, setoff); - __bang_sub(x2, x2, tem, setoff); - - //max y2 - __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[3]); - __bang_cycle_sub(y2, y2, scores_max_boxes_ptr, setoff, setoff); - __bang_active_relu(y2, y2, setoff); - __bang_cycle_add(y2, y2, scores_max_boxes_ptr, setoff, setoff); - - //--------- intesection------- - //fing W - __bang_sub(w, x2, x1, setoff); - __bang_active_relu(w, w, setoff); - - //find H - __bang_sub(h, y1, y2, setoff); - __bang_active_relu(h, h, setoff); - - //fing intersection - __bang_mul(interarea_ptr, h, w, setoff); - - //fing scores max boxes area - scores_max_boxes_area=(scores_max_boxes[1] - scores_max_boxes[3]) * (scores_max_boxes[2] - scores_max_boxes[0]); - __bang_write_value(scores_max_boxes_area_ptr, setoff, scores_max_boxes_area); - __bang_cycle_add(tem, area_ptr, scores_max_boxes_area_ptr, setoff, setoff); - __bang_sub(tem, tem, interarea_ptr, setoff); - __bang_write_value(nms_thresPtr, setoff, nms_thres); - __bang_cycle_mul(tem, tem, nms_thresPtr, setoff, setoff); - __bang_ge(tem, interarea_ptr, tem, setoff); - - for(int i =0; i < setoff; i++){ - result[i] = (int)(tem[i]); - } - - __memcpy(output_boxes_index, result, input_boxes_num * sizeof(uint8_t), NRAM2GDRAM); -} - - -UNION_OP_KERNEL_IMPLE(MlNms, float, Fast); -UNION_OP_KERNEL_IMPLE(MlNms, half, Fast); - -void MLUOP_WIN_API mluBlockKernelMlNmsFloatFast( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t data_type, void* boxes_data_ptr, void* scores_max_boxes_data_ptr, float nms_thres, int input_boxes_num, uint8_t* output_boxes_index){ - MLUBlockKernelMlNmsfloatFast<<>>( - data_type, boxes_data_ptr, scores_max_boxes_data_ptr, nms_thres, input_boxes_num, output_boxes_index); -} - -void MLUOP_WIN_API mluBlockKernelMlNmsHalfFast( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t data_type, void* boxes_data_ptr, void* scores_max_boxes_data_ptr, float nms_thres, int input_boxes_num, uint8_t* output_boxes_index){ - MLUBlockKernelMlNmshalfFast<<>>( - data_type, boxes_data_ptr, scores_max_boxes_data_ptr, nms_thres, input_boxes_num, output_boxes_index); -} From edd4c1294d8798deff191fcd3c42bf0e48315159 Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+Jones154@users.noreply.github.com> Date: Fri, 12 Aug 2022 16:37:52 +0800 Subject: [PATCH 08/36] Delete ml_nms_union.mlu --- bangc-ops/kernels/ml_nms/ml_nms_union.mlu | 201 ---------------------- 1 file changed, 201 deletions(-) delete mode 100644 bangc-ops/kernels/ml_nms/ml_nms_union.mlu diff --git a/bangc-ops/kernels/ml_nms/ml_nms_union.mlu b/bangc-ops/kernels/ml_nms/ml_nms_union.mlu deleted file mode 100644 index cc878d2e3..000000000 --- a/bangc-ops/kernels/ml_nms/ml_nms_union.mlu +++ /dev/null @@ -1,201 +0,0 @@ -#include "bang.h" -#include "mlu_op_kernel.h" -#include "kernels/unary_op/unary_op_union.h" - -#define MLU_VERSION 270 -#if MLU_VERSION == 220 -#undef MLU_VERSION -#define MLU_VERSION 220 -#elif MLU_VERSION == 270 -#undef MLU_VERSION -#define MLU_VERSION 270 -#elif MLU_VERSION == 290 -#undef MLU_VERSION -#define MLU_VERSION 290 -#elif MLU_VERSION == 322 -#undef MLU_VERSION -#define MLU_VERSION 322 -#elif MLU_VERSION == 
372 -#undef MLU_VERSION -#define MLU_VERSION 372 -#endif - -__mlu_func__ void setSetoff(mluOpDataType_t data_type, int input_boxes_num, int min_cell, int cord_num, int* setoff){ - switch (data_type){ - case MLUOP_DTYPE_HALF: - if((input_boxes_num % (min_cell / 2 * cord_num)) != 0){ - *setoff = (min_cell / 2) * (input_boxes_num / (min_cell / 2 * cord_num) + 1); - }else{ - *setoff = (min_cell / 2) * (input_boxes_num / (min_cell / 2 * cord_num)); - } - case MLUOP_DTYPE_FLOAT: - if ((input_boxes_num % (min_cell / 4 * cord_num)) != 0){ - *setoff = (min_cell / 4) * (input_boxes_num / (min_cell / 4 * cord_num) + 1); - } else { - *setoff = (min_cell / 4) * (input_boxes_num / (min_cell / 4 * cord_num)); - } - default:break; - } -} - -__mlu_func__ void getOffsetNumMlNmsFast(int input_boxes_num, mluOpDataType_t data_type, int* setoff, cnrtFunctionType_t k_type){ - if(k_type == CNRT_FUNC_TYPE_UNION1){ - if (MLU_VERSION < 200){ - setSetoff(data_type, input_boxes_num, 64, 4, setoff); - } else if (MLU_VERSION > 200 && MLU_VERSION < 300){ - setSetoff(data_type, input_boxes_num, 128, 4, setoff); - } else if (MLU_VERSION > 300 && MLU_VERSION < 400){ - setSetoff(data_type, input_boxes_num, 128, 4, setoff); - } - } else if(k_type == CNRT_FUNC_TYPE_UNION2){ - if (MLU_VERSION < 200){ - setSetoff(data_type, input_boxes_num, 64, 8, setoff); - } else if (MLU_VERSION > 250 && MLU_VERSION < 300){ - setSetoff(data_type, input_boxes_num, 128, 8, setoff); - } else if (MLU_VERSION > 300 && MLU_VERSION < 400){ - setSetoff(data_type, input_boxes_num, 128, 8, setoff); - } - } else if(k_type == CNRT_FUNC_TYPE_UNION4){ - if (MLU_VERSION < 200){ - setSetoff(data_type, input_boxes_num, 64, 16, setoff); - } else if (MLU_VERSION > 250 && MLU_VERSION < 300){ - setSetoff(data_type, input_boxes_num, 128, 16, setoff); - } else if (MLU_VERSION > 300 && MLU_VERSION < 400){ - setSetoff(data_type, input_boxes_num, 128, 16, setoff); - } - } else if(k_type == CNRT_FUNC_TYPE_UNION8){ - if (MLU_VERSION < 200){ - setSetoff(data_type, input_boxes_num, 64, 32, setoff); - } - } -} - - - -template -__mlu_func__ void computeMlNmsFast(T* boxes_data_ptr, T* scores_max_boxes_data_ptr, T nms_thres, int setoff, int input_boxes_num, uint8_t* output_boxes_index){ - - __nram__ T scores_max_boxes[4]; - __nram__ T scores_max_boxes_area; - __nram__ T boxes_data[512]; - __nram__ T x1[512]; - __nram__ T y1[512]; - __nram__ T x2[512]; - __nram__ T y2[512]; - __nram__ T w[512]; - __nram__ T h[512]; - __nram__ T area_ptr[512]; - __nram__ T interarea_ptr[512]; - __nram__ T scores_max_boxes_area_ptr[512]; - __nram__ T nms_thres_ptr[512]; - __nram__ T scores_max_boxes_ptr[512]; - __nram__ T tem[512]; - __nram__ uint8_t result[512]; - - if (input_boxes_num % taskDim != 0){ - __memcpy(boxes_data, boxes_data_ptr + (taskId * (input_boxes_num / taskDim) * 4), (input_boxes_num / taskDim) * 4 * sizeof(T), GDRAM2NRAM); - } else { - if (taskId == (taskDim - 1)){ - __memcpy(boxes_data, boxes_data_ptr + (taskId * (input_boxes_num / taskDim) * 4), ((input_boxes_num / taskDim) + (input_boxes_num % taskDim)) * 4 * sizeof(T), GDRAM2NRAM); - } else { - __memcpy(boxes_data, boxes_data_ptr + (taskId * (input_boxes_num / taskDim) * 4), (input_boxes_num / taskDim) * 4 * sizeof(T), GDRAM2NRAM); - } - } - __memcpy(scores_max_boxes, scores_max_boxes_data_ptr, 4 * sizeof(T), GDRAM2NRAM); - - int j,i; - for(i = 0, j = 0; i < setoff; i++, j+=4){ - x1[i] = boxes_data[j + 0]; - y1[i] = boxes_data[j + 1]; - x2[i] = boxes_data[j + 2]; - y2[i] = boxes_data[j + 3]; - } - - //-----------------iou 
detect-------------------- - //fing all boxes area - __bang_sub(h, y1, y2, setoff); - __bang_sub(w, x2, x1, setoff); - __bang_mul(area_ptr, h, w, setoff); - - //max x1 - __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[0]); - __bang_cycle_sub(x1, x1, scores_max_boxes_ptr, setoff, setoff); - __bang_active_relu(x1, x1, setoff); - __bang_cycle_add(x1, x1, scores_max_boxes_ptr, setoff, setoff); - - //min y1 - __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[1]); - __bang_write_zero(tem, setoff); - __bang_cycle_add(tem, tem, scores_max_boxes_ptr, setoff, setoff); - __bang_sub(tem, y1, scores_max_boxes_ptr, setoff); - __bang_active_relu(tem, tem, setoff); - __bang_sub(y1, y1, tem, setoff); - - //min x2 - __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[2]); - __bang_write_zero(tem, setoff); - __bang_cycle_add(tem, tem, scores_max_boxes_ptr, setoff, setoff); - __bang_sub(tem, x2, scores_max_boxes_ptr, setoff); - __bang_active_relu(tem, tem, setoff); - __bang_sub(x2, x2, tem, setoff); - - //max y2 - __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[3]); - __bang_cycle_sub(y2, y2, scores_max_boxes_ptr, setoff, setoff); - __bang_active_relu(y2, y2, setoff); - __bang_cycle_add(y2, y2, scores_max_boxes_ptr, setoff, setoff); - - //--------- intesection------- - //fing W - __bang_sub(w, x2, x1, setoff); - __bang_active_relu(w, w, setoff); - - //find H - __bang_sub(h, y1, y2, setoff); - __bang_active_relu(h, h, setoff); - - //fing intersection - __bang_mul(interarea_ptr, h, w, setoff); - - //fing scores max boxes area - scores_max_boxes_area=(scores_max_boxes[1] - scores_max_boxes[3]) * (scores_max_boxes[2] - scores_max_boxes[0]); - __bang_write_value(scores_max_boxes_area_ptr, setoff, scores_max_boxes_area); - __bang_cycle_add(tem, area_ptr, scores_max_boxes_area_ptr, setoff, setoff); - __bang_sub(tem, tem, interarea_ptr, setoff); - __bang_write_value(nms_thres_ptr, setoff, nms_thres); - __bang_cycle_mul(tem, tem, nms_thres_ptr, setoff, setoff); - __bang_gt(tem, interarea_ptr, tem, setoff); - - for(int i =0; i < setoff; i++){ - result[i] = (int)(tem[i]); - } - - if (input_boxes_num % taskDim !=0){ - if (taskId == (taskDim - 1)){ - __memcpy(output_boxes_index + (taskId * (input_boxes_num / taskDim)), result, (input_boxes_num / taskDim + input_boxes_num % taskDim) * sizeof(uint8_t), NRAM2GDRAM); - } else { - __memcpy(output_boxes_index + (taskId * (input_boxes_num / taskDim)), result, (input_boxes_num / taskDim) * sizeof(uint8_t), NRAM2GDRAM); - } - } else { - __memcpy(output_boxes_index + (taskId * (input_boxes_num / taskDim)), result, (input_boxes_num / taskDim) * sizeof(uint8_t), NRAM2GDRAM); - } - -} - - -UNION_OP_KERNEL_IMPLE(MlNms, float, Fast); -UNION_OP_KERNEL_IMPLE(MlNms, half, Fast); - -void MLUOP_WIN_API mluUnionKernelMlNmsFloatFast( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t data_type, void* boxes_data_ptr, void* scores_max_boxes_data_ptr, float nms_thres, int input_boxes_num, uint8_t* output_boxes_index){ - MLUUnionKernelMlNmsfloatFast<<>>( - k_type, data_type, boxes_data_ptr, scores_max_boxes_data_ptr, nms_thres, input_boxes_num, output_boxes_index); -} - -void MLUOP_WIN_API mluUnionKernelMlNmsHalfFast( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t data_type, void* boxes_data_ptr, void* scores_max_boxes_data_ptr, float nms_thres, int input_boxes_num, uint8_t* output_boxes_index){ - MLUUnionKernelMlNmshalfFast<<>>( - k_type, data_type, 
boxes_data_ptr, scores_max_boxes_data_ptr, nms_thres, input_boxes_num, output_boxes_index); -} From 393e41e1fdd828b27ccc525c98720a5bd0d91696 Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+Jones154@users.noreply.github.com> Date: Fri, 12 Aug 2022 16:58:39 +0800 Subject: [PATCH 09/36] Add files via upload --- bangc-ops/kernels/ml_nms/ml_nms.cpp | 74 ++++++++ bangc-ops/kernels/ml_nms/ml_nms_block.mlu | 166 ++++++++++++++++++ bangc-ops/kernels/ml_nms/ml_nms_union.mlu | 201 ++++++++++++++++++++++ 3 files changed, 441 insertions(+) create mode 100644 bangc-ops/kernels/ml_nms/ml_nms.cpp create mode 100644 bangc-ops/kernels/ml_nms/ml_nms_block.mlu create mode 100644 bangc-ops/kernels/ml_nms/ml_nms_union.mlu diff --git a/bangc-ops/kernels/ml_nms/ml_nms.cpp b/bangc-ops/kernels/ml_nms/ml_nms.cpp new file mode 100644 index 000000000..f59972a6a --- /dev/null +++ b/bangc-ops/kernels/ml_nms/ml_nms.cpp @@ -0,0 +1,74 @@ +/************************************************************************* + > File Name: main.cpp + > Author: wenzhengyin + > Mail: jones980116@163.com + > Created Time: Tue Apr 19 14:35:06 2022 + ************************************************************************/ +#include "cnrt.h" +#include "cndev.h" +//#include "cnrt_data.h" +#include +#include +#include +#include +#include +#include +#include +#include "core/mlu_op_core.h" +#include "core/context.h" +#include "core/gen_case.h" +#include "core/logging.h" +#include "core/runtime/device.h" +#include "core/tensor.h" +#include "core/type.h" +#include "mlu_op_kernel.h" +#include "kernels/unary_op/unary_op_host.h" +using namespace std; +typedef uint16_t half; + + +mluOpStatus_t MLUOP_WIN_API mluOpMlNms(mluOpHandle_t handle, + const mluOpTensorDescriptor_t boxes_data_ptr_desc, void* boxes_data_ptr, void* scores_max_boxes_data_ptr, + int input_boxes_num, float iou_threshold, uint8_t* output_boxes_index) { + + int setoff; + bool zero_element = false; + mluOpDataType_t data_type = MLUOP_DTYPE_HALF; + mluOpDataType_t support_type[2] = {MLUOP_DTYPE_HALF, MLUOP_DTYPE_FLOAT}; + + cnrtDim3_t k_dim = {4, 1, 1}; + cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1; + + mluOpStatus_t param_check = unaryOpNmsParamCheck( + "[mluOpMlNms]", boxes_data_ptr_desc, boxes_data_ptr, scores_max_boxes_data_ptr, support_type, 2, zero_element); + if(param_check != MLUOP_STATUS_SUCCESS){ + return param_check; + } + + void (*mluOpFuncKernel)(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, mluOpDataType_t data_type, + void* boxes_data_ptr, void* scores_max_boxes_data_ptr, float nmsThres, int input_boxes_num, uint8_t* output_boxes_index); + //mluOpFuncKernel = NULL; + if (k_type == CNRT_FUNC_TYPE_BLOCK){ + if (boxes_data_ptr_desc->dtype == MLUOP_DTYPE_HALF){ + mluOpFuncKernel = mluBlockKernelMlNmsHalfFast; + }else { + mluOpFuncKernel = mluBlockKernelMlNmsFloatFast; + } + }else { + if (boxes_data_ptr_desc->dtype == MLUOP_DTYPE_HALF){ + mluOpFuncKernel = mluUnionKernelMlNmsHalfFast; + }else{ + mluOpFuncKernel = mluUnionKernelMlNmsFloatFast; + } + } + + KERNEL_CHECK( + (mluOpFuncKernel(k_dim, k_type, handle->queue, boxes_data_ptr_desc->dtype, boxes_data_ptr, scores_max_boxes_data_ptr, iou_threshold, input_boxes_num, output_boxes_index))); + GEN_CASE_END(); + + return MLUOP_STATUS_SUCCESS; + +} + + + diff --git a/bangc-ops/kernels/ml_nms/ml_nms_block.mlu b/bangc-ops/kernels/ml_nms/ml_nms_block.mlu new file mode 100644 index 000000000..be80bd2fc --- /dev/null +++ b/bangc-ops/kernels/ml_nms/ml_nms_block.mlu @@ -0,0 +1,166 @@ +#include "bang.h" 
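+// Block-task ML-NMS kernel: copies the candidate boxes into NRAM, computes the
+// IoU of every box against the single highest-scoring box with vector
+// primitives, and writes a per-box 0/1 mask (IoU >= nms_thres) to
+// output_boxes_index.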
+#include "mlu_op_kernel.h" +#include "kernels/unary_op/unary_op_block.h" + +#define MLU_VERSION 270 +#if MLU_VERSION == 220 +#undef MLU_VERSION +#define MLU_VERSION 220 +#elif MLU_VERSION == 270 +#undef MLU_VERSION +#define MLU_VERSION 270 +#elif MLU_VERSION == 290 +#undef MLU_VERSION +#define MLU_VERSION 290 +#elif MLU_VERSION == 322 +#undef MLU_VERSION +#define MLU_VERSION 322 +#elif MLU_VERSION == 372 +#undef MLU_VERSION +#define MLU_VERSION 372 +#endif + +__mlu_func__ void setSetoff(mluOpDataType_t data_type, int input_boxes_num, int min_cell, int* setoff){ + + switch (data_type){ + case MLUOP_DTYPE_HALF: + if((input_boxes_num % (min_cell / 2)) != 0){ + *setoff = (min_cell / 2) * (input_boxes_num / (min_cell / 2) + 1); + }else{ + *setoff = (min_cell / 2) * (input_boxes_num / (min_cell / 2)); + } + case MLUOP_DTYPE_FLOAT: + if ((input_boxes_num % (min_cell / 4)) != 0){ + *setoff = (min_cell / 4) * (input_boxes_num / (min_cell / 4) + 1); + } else { + *setoff = (min_cell / 4) * (input_boxes_num / (min_cell / 4)); + } + default:break; + } +} + +__mlu_func__ void getOffsetNumMlNmsFast(int input_boxes_num, mluOpDataType_t data_type, int* setoff){ + + if (MLU_VERSION < 200){ + setSetoff(data_type, input_boxes_num, 64, setoff); + } else if (MLU_VERSION > 200 && MLU_VERSION < 300){ + setSetoff(data_type, input_boxes_num, 128, setoff); + } else if (MLU_VERSION > 300 && MLU_VERSION < 400){ + setSetoff(data_type, input_boxes_num, 128, setoff); + } +} + + + +template +__mlu_func__ void computeMlNmsFast(T* boxes_data_ptr, T* scores_max_boxes_data_ptr, T nms_thres, int setoff, int input_boxes_num, uint8_t* output_boxes_index){ + + __nram__ T scores_max_boxes[4]; + __nram__ T scores_max_boxes_area; + __nram__ T boxes_data[512]; + __nram__ T x1[512]; + __nram__ T y1[512]; + __nram__ T x2[512]; + __nram__ T y2[512]; + __nram__ T w[512]; + __nram__ T h[512]; + __nram__ T area_ptr[512]; + __nram__ T interarea_ptr[512]; + __nram__ T scores_max_boxes_area_ptr[512]; + __nram__ T nms_thresPtr[512]; + __nram__ T scores_max_boxes_ptr[512]; + __nram__ T tem[512]; + __nram__ uint8_t result[512]; + + __memcpy(boxes_data, boxes_data_ptr, input_boxes_num * 4 * sizeof(T), GDRAM2NRAM); + __memcpy(scores_max_boxes, scores_max_boxes_data_ptr, 4 * sizeof(T), GDRAM2NRAM); + + int j,i; + for(i = 0, j = 0; i < setoff; i++, j+=4){ + x1[i] = boxes_data[j + 0]; + y1[i] = boxes_data[j + 1]; + x2[i] = boxes_data[j + 2]; + y2[i] = boxes_data[j + 3]; + } + + + //-----------------iou detect-------------------- + + //fing all boxes area + __bang_sub(h, y1, y2, setoff); + __bang_sub(w, x2, x1, setoff); + __bang_mul(area_ptr, h, w, setoff); + + //max x1 + __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[0]); + __bang_cycle_sub(x1, x1, scores_max_boxes_ptr, setoff, setoff); + __bang_active_relu(x1, x1, setoff); + __bang_cycle_add(x1, x1, scores_max_boxes_ptr, setoff, setoff); + + //min y1 + __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[1]); + __bang_write_zero(tem, setoff); + __bang_cycle_add(tem, tem, scores_max_boxes_ptr, setoff, setoff); + __bang_sub(tem, y1, scores_max_boxes_ptr, setoff); + __bang_active_relu(tem, tem, setoff); + __bang_sub(y1, y1, tem, setoff); + + //min x2 + __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[2]); + __bang_write_zero(tem, setoff); + __bang_cycle_add(tem, tem, scores_max_boxes_ptr, setoff, setoff); + __bang_sub(tem, x2, scores_max_boxes_ptr, setoff); + __bang_active_relu(tem, tem, setoff); + __bang_sub(x2, x2, tem, setoff); + + //max y2 + 
__bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[3]); + __bang_cycle_sub(y2, y2, scores_max_boxes_ptr, setoff, setoff); + __bang_active_relu(y2, y2, setoff); + __bang_cycle_add(y2, y2, scores_max_boxes_ptr, setoff, setoff); + + //--------- intesection------- + //fing W + __bang_sub(w, x2, x1, setoff); + __bang_active_relu(w, w, setoff); + + //find H + __bang_sub(h, y1, y2, setoff); + __bang_active_relu(h, h, setoff); + + //fing intersection + __bang_mul(interarea_ptr, h, w, setoff); + + //fing scores max boxes area + scores_max_boxes_area=(scores_max_boxes[1] - scores_max_boxes[3]) * (scores_max_boxes[2] - scores_max_boxes[0]); + __bang_write_value(scores_max_boxes_area_ptr, setoff, scores_max_boxes_area); + __bang_cycle_add(tem, area_ptr, scores_max_boxes_area_ptr, setoff, setoff); + __bang_sub(tem, tem, interarea_ptr, setoff); + __bang_write_value(nms_thresPtr, setoff, nms_thres); + __bang_cycle_mul(tem, tem, nms_thresPtr, setoff, setoff); + __bang_ge(tem, interarea_ptr, tem, setoff); + + for(int i =0; i < setoff; i++){ + result[i] = (int)(tem[i]); + } + + __memcpy(output_boxes_index, result, input_boxes_num * sizeof(uint8_t), NRAM2GDRAM); +} + + +UNION_OP_KERNEL_IMPLE(MlNms, float, Fast); +UNION_OP_KERNEL_IMPLE(MlNms, half, Fast); + +void MLUOP_WIN_API mluBlockKernelMlNmsFloatFast( + cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + mluOpDataType_t data_type, void* boxes_data_ptr, void* scores_max_boxes_data_ptr, float nms_thres, int input_boxes_num, uint8_t* output_boxes_index){ + MLUBlockKernelMlNmsfloatFast<<>>( + data_type, boxes_data_ptr, scores_max_boxes_data_ptr, nms_thres, input_boxes_num, output_boxes_index); +} + +void MLUOP_WIN_API mluBlockKernelMlNmsHalfFast( + cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + mluOpDataType_t data_type, void* boxes_data_ptr, void* scores_max_boxes_data_ptr, float nms_thres, int input_boxes_num, uint8_t* output_boxes_index){ + MLUBlockKernelMlNmshalfFast<<>>( + data_type, boxes_data_ptr, scores_max_boxes_data_ptr, nms_thres, input_boxes_num, output_boxes_index); +} diff --git a/bangc-ops/kernels/ml_nms/ml_nms_union.mlu b/bangc-ops/kernels/ml_nms/ml_nms_union.mlu new file mode 100644 index 000000000..cc878d2e3 --- /dev/null +++ b/bangc-ops/kernels/ml_nms/ml_nms_union.mlu @@ -0,0 +1,201 @@ +#include "bang.h" +#include "mlu_op_kernel.h" +#include "kernels/unary_op/unary_op_union.h" + +#define MLU_VERSION 270 +#if MLU_VERSION == 220 +#undef MLU_VERSION +#define MLU_VERSION 220 +#elif MLU_VERSION == 270 +#undef MLU_VERSION +#define MLU_VERSION 270 +#elif MLU_VERSION == 290 +#undef MLU_VERSION +#define MLU_VERSION 290 +#elif MLU_VERSION == 322 +#undef MLU_VERSION +#define MLU_VERSION 322 +#elif MLU_VERSION == 372 +#undef MLU_VERSION +#define MLU_VERSION 372 +#endif + +__mlu_func__ void setSetoff(mluOpDataType_t data_type, int input_boxes_num, int min_cell, int cord_num, int* setoff){ + switch (data_type){ + case MLUOP_DTYPE_HALF: + if((input_boxes_num % (min_cell / 2 * cord_num)) != 0){ + *setoff = (min_cell / 2) * (input_boxes_num / (min_cell / 2 * cord_num) + 1); + }else{ + *setoff = (min_cell / 2) * (input_boxes_num / (min_cell / 2 * cord_num)); + } + case MLUOP_DTYPE_FLOAT: + if ((input_boxes_num % (min_cell / 4 * cord_num)) != 0){ + *setoff = (min_cell / 4) * (input_boxes_num / (min_cell / 4 * cord_num) + 1); + } else { + *setoff = (min_cell / 4) * (input_boxes_num / (min_cell / 4 * cord_num)); + } + default:break; + } +} + +__mlu_func__ void getOffsetNumMlNmsFast(int input_boxes_num, 
mluOpDataType_t data_type, int* setoff, cnrtFunctionType_t k_type){ + if(k_type == CNRT_FUNC_TYPE_UNION1){ + if (MLU_VERSION < 200){ + setSetoff(data_type, input_boxes_num, 64, 4, setoff); + } else if (MLU_VERSION > 200 && MLU_VERSION < 300){ + setSetoff(data_type, input_boxes_num, 128, 4, setoff); + } else if (MLU_VERSION > 300 && MLU_VERSION < 400){ + setSetoff(data_type, input_boxes_num, 128, 4, setoff); + } + } else if(k_type == CNRT_FUNC_TYPE_UNION2){ + if (MLU_VERSION < 200){ + setSetoff(data_type, input_boxes_num, 64, 8, setoff); + } else if (MLU_VERSION > 250 && MLU_VERSION < 300){ + setSetoff(data_type, input_boxes_num, 128, 8, setoff); + } else if (MLU_VERSION > 300 && MLU_VERSION < 400){ + setSetoff(data_type, input_boxes_num, 128, 8, setoff); + } + } else if(k_type == CNRT_FUNC_TYPE_UNION4){ + if (MLU_VERSION < 200){ + setSetoff(data_type, input_boxes_num, 64, 16, setoff); + } else if (MLU_VERSION > 250 && MLU_VERSION < 300){ + setSetoff(data_type, input_boxes_num, 128, 16, setoff); + } else if (MLU_VERSION > 300 && MLU_VERSION < 400){ + setSetoff(data_type, input_boxes_num, 128, 16, setoff); + } + } else if(k_type == CNRT_FUNC_TYPE_UNION8){ + if (MLU_VERSION < 200){ + setSetoff(data_type, input_boxes_num, 64, 32, setoff); + } + } +} + + + +template +__mlu_func__ void computeMlNmsFast(T* boxes_data_ptr, T* scores_max_boxes_data_ptr, T nms_thres, int setoff, int input_boxes_num, uint8_t* output_boxes_index){ + + __nram__ T scores_max_boxes[4]; + __nram__ T scores_max_boxes_area; + __nram__ T boxes_data[512]; + __nram__ T x1[512]; + __nram__ T y1[512]; + __nram__ T x2[512]; + __nram__ T y2[512]; + __nram__ T w[512]; + __nram__ T h[512]; + __nram__ T area_ptr[512]; + __nram__ T interarea_ptr[512]; + __nram__ T scores_max_boxes_area_ptr[512]; + __nram__ T nms_thres_ptr[512]; + __nram__ T scores_max_boxes_ptr[512]; + __nram__ T tem[512]; + __nram__ uint8_t result[512]; + + if (input_boxes_num % taskDim != 0){ + __memcpy(boxes_data, boxes_data_ptr + (taskId * (input_boxes_num / taskDim) * 4), (input_boxes_num / taskDim) * 4 * sizeof(T), GDRAM2NRAM); + } else { + if (taskId == (taskDim - 1)){ + __memcpy(boxes_data, boxes_data_ptr + (taskId * (input_boxes_num / taskDim) * 4), ((input_boxes_num / taskDim) + (input_boxes_num % taskDim)) * 4 * sizeof(T), GDRAM2NRAM); + } else { + __memcpy(boxes_data, boxes_data_ptr + (taskId * (input_boxes_num / taskDim) * 4), (input_boxes_num / taskDim) * 4 * sizeof(T), GDRAM2NRAM); + } + } + __memcpy(scores_max_boxes, scores_max_boxes_data_ptr, 4 * sizeof(T), GDRAM2NRAM); + + int j,i; + for(i = 0, j = 0; i < setoff; i++, j+=4){ + x1[i] = boxes_data[j + 0]; + y1[i] = boxes_data[j + 1]; + x2[i] = boxes_data[j + 2]; + y2[i] = boxes_data[j + 3]; + } + + //-----------------iou detect-------------------- + //fing all boxes area + __bang_sub(h, y1, y2, setoff); + __bang_sub(w, x2, x1, setoff); + __bang_mul(area_ptr, h, w, setoff); + + //max x1 + __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[0]); + __bang_cycle_sub(x1, x1, scores_max_boxes_ptr, setoff, setoff); + __bang_active_relu(x1, x1, setoff); + __bang_cycle_add(x1, x1, scores_max_boxes_ptr, setoff, setoff); + + //min y1 + __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[1]); + __bang_write_zero(tem, setoff); + __bang_cycle_add(tem, tem, scores_max_boxes_ptr, setoff, setoff); + __bang_sub(tem, y1, scores_max_boxes_ptr, setoff); + __bang_active_relu(tem, tem, setoff); + __bang_sub(y1, y1, tem, setoff); + + //min x2 + __bang_write_value(scores_max_boxes_ptr, setoff, 
scores_max_boxes[2]); + __bang_write_zero(tem, setoff); + __bang_cycle_add(tem, tem, scores_max_boxes_ptr, setoff, setoff); + __bang_sub(tem, x2, scores_max_boxes_ptr, setoff); + __bang_active_relu(tem, tem, setoff); + __bang_sub(x2, x2, tem, setoff); + + //max y2 + __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[3]); + __bang_cycle_sub(y2, y2, scores_max_boxes_ptr, setoff, setoff); + __bang_active_relu(y2, y2, setoff); + __bang_cycle_add(y2, y2, scores_max_boxes_ptr, setoff, setoff); + + //--------- intesection------- + //fing W + __bang_sub(w, x2, x1, setoff); + __bang_active_relu(w, w, setoff); + + //find H + __bang_sub(h, y1, y2, setoff); + __bang_active_relu(h, h, setoff); + + //fing intersection + __bang_mul(interarea_ptr, h, w, setoff); + + //fing scores max boxes area + scores_max_boxes_area=(scores_max_boxes[1] - scores_max_boxes[3]) * (scores_max_boxes[2] - scores_max_boxes[0]); + __bang_write_value(scores_max_boxes_area_ptr, setoff, scores_max_boxes_area); + __bang_cycle_add(tem, area_ptr, scores_max_boxes_area_ptr, setoff, setoff); + __bang_sub(tem, tem, interarea_ptr, setoff); + __bang_write_value(nms_thres_ptr, setoff, nms_thres); + __bang_cycle_mul(tem, tem, nms_thres_ptr, setoff, setoff); + __bang_gt(tem, interarea_ptr, tem, setoff); + + for(int i =0; i < setoff; i++){ + result[i] = (int)(tem[i]); + } + + if (input_boxes_num % taskDim !=0){ + if (taskId == (taskDim - 1)){ + __memcpy(output_boxes_index + (taskId * (input_boxes_num / taskDim)), result, (input_boxes_num / taskDim + input_boxes_num % taskDim) * sizeof(uint8_t), NRAM2GDRAM); + } else { + __memcpy(output_boxes_index + (taskId * (input_boxes_num / taskDim)), result, (input_boxes_num / taskDim) * sizeof(uint8_t), NRAM2GDRAM); + } + } else { + __memcpy(output_boxes_index + (taskId * (input_boxes_num / taskDim)), result, (input_boxes_num / taskDim) * sizeof(uint8_t), NRAM2GDRAM); + } + +} + + +UNION_OP_KERNEL_IMPLE(MlNms, float, Fast); +UNION_OP_KERNEL_IMPLE(MlNms, half, Fast); + +void MLUOP_WIN_API mluUnionKernelMlNmsFloatFast( + cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + mluOpDataType_t data_type, void* boxes_data_ptr, void* scores_max_boxes_data_ptr, float nms_thres, int input_boxes_num, uint8_t* output_boxes_index){ + MLUUnionKernelMlNmsfloatFast<<>>( + k_type, data_type, boxes_data_ptr, scores_max_boxes_data_ptr, nms_thres, input_boxes_num, output_boxes_index); +} + +void MLUOP_WIN_API mluUnionKernelMlNmsHalfFast( + cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + mluOpDataType_t data_type, void* boxes_data_ptr, void* scores_max_boxes_data_ptr, float nms_thres, int input_boxes_num, uint8_t* output_boxes_index){ + MLUUnionKernelMlNmshalfFast<<>>( + k_type, data_type, boxes_data_ptr, scores_max_boxes_data_ptr, nms_thres, input_boxes_num, output_boxes_index); +} From 487b228cd3a7fe97ffff5f7d7e409baf6925e1ee Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+Jones154@users.noreply.github.com> Date: Tue, 13 Dec 2022 14:41:47 +0800 Subject: [PATCH 10/36] Update mlu_op.h --- bangc-ops/mlu_op.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/bangc-ops/mlu_op.h b/bangc-ops/mlu_op.h index 2efc51d33..342ede63b 100755 --- a/bangc-ops/mlu_op.h +++ b/bangc-ops/mlu_op.h @@ -1464,6 +1464,30 @@ mluOpGetTensorAndDataFromTensorSet(mluOpTensorSetDescriptor_t tensorSetDesc, mluOpTensorDescriptor_t *tensorDesc, void **dataAddrInDevice); +/* + * + * @param handle : Set the handle to the MLU + * + * @param mluOpTensorDescriptor_t : Properties of the 
input data
+ *
+ * @param boxesDataPtr : The coordinates of the input boxes
+ *
+ * @param scoresMaxBoxesDataPtr : Coordinates of the box with the maximum score
+ *
+ * @param inputBoxesNum : Number of input boxes
+ *
+ * @param iouThreshold : Threshold on the intersection-over-union (IoU) ratio
+ *
+ * @param outputBoxesIndex : Output flag for each input box
+ *
+ */
+
+mluOpStatus_t MLUOP_WIN_API
+mluOpMlNms(mluOpHandle_t handle,
+ const mluOpTensorDescriptor_t boxes_data_ptr_desc,
+ void* boxes_data_ptr,
+ float iou_threshold,
+ void* output_boxes_index);
 // Group:Abs
 /*!
  * @brief Computes the absolute value for every element of the input tensor \b x

From 3ee782a1e524898151aeb3e2e5055d4698aed54a Mon Sep 17 00:00:00 2001
From: wenzhengyin <71548662+Jones154@users.noreply.github.com>
Date: Tue, 13 Dec 2022 14:44:55 +0800
Subject: [PATCH 11/36] Update mlu_op_kernel.h

---
 bangc-ops/mlu_op_kernel.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/bangc-ops/mlu_op_kernel.h b/bangc-ops/mlu_op_kernel.h
index d211a7e74..d5ec76aa6 100644
--- a/bangc-ops/mlu_op_kernel.h
+++ b/bangc-ops/mlu_op_kernel.h
@@ -25,6 +25,7 @@
 #include 
 #include "cnrt.h"
+#include "mlu_op.h"

 #ifndef MLUOP_WIN_API
 #ifdef _WIN32
@@ -38,6 +39,17 @@
 extern "C" {
 #endif  // __cplusplus

+/* ml_nms */
+void MLUOP_WIN_API mluOpKernelMlNmsFloatFast(cnrtDim3_t k_dim,
+ cnrtFunctionType_t k_type, cnrtQueue_t queue, mluOpDataType_t data_type,
+ void* boxes_data_ptr, float nms_thres, int input_boxes_num,
+ uint8_t* output_boxes_index);
+
+void MLUOP_WIN_API mluOpKernelMlNmsHalfFast(cnrtDim3_t k_dim,
+ cnrtFunctionType_t k_type, cnrtQueue_t queue, mluOpDataType_t data_type,
+ void* boxes_data_ptr, float nms_thres, int input_boxes_num,
+ uint8_t* output_boxes_index);
+
 /* Abs */
 void MLUOP_WIN_API mluOpBlockKernel3StagePipelineAbsHalfFast(
 cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,

From 36c5926fb8008c5f546bceb2d2eaeea3a97df3ed Mon Sep 17 00:00:00 2001
From: wenzhengyin <71548662+Jones154@users.noreply.github.com>
Date: Tue, 13 Dec 2022 14:49:01 +0800
Subject: [PATCH 12/36] Update ml_nms.cpp

---
 bangc-ops/kernels/ml_nms/ml_nms.cpp | 145 +++++++++++++++++-----------
 1 file changed, 91 insertions(+), 54 deletions(-)

diff --git a/bangc-ops/kernels/ml_nms/ml_nms.cpp b/bangc-ops/kernels/ml_nms/ml_nms.cpp
index f59972a6a..467747ac9 100644
--- a/bangc-ops/kernels/ml_nms/ml_nms.cpp
+++ b/bangc-ops/kernels/ml_nms/ml_nms.cpp
@@ -1,20 +1,27 @@
 /*************************************************************************
- > File Name: main.cpp
- > Author: wenzhengyin
- > Mail: jones980116@163.com
- > Created Time: Tue Apr 19 14:35:06 2022
- ************************************************************************/
-#include "cnrt.h"
-#include "cndev.h"
-//#include "cnrt_data.h"
+* Copyright (C) [2022] by Cambricon, Inc.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the
+* "Software"), to deal in the Software without restriction, including
+* without limitation the rights to use, copy, modify, merge, publish,
+* distribute, sublicense, and/or sell copies of the Software, and to
+* permit persons to whom the Software is furnished to do so, subject to
+* the following conditions:
+*
+* The above copyright notice and this permission notice shall be included
+* in all copies or substantial portions of the Software.
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +************************************************************************/ #include -#include -#include -#include -#include -#include -#include -#include "core/mlu_op_core.h" +#include "mlu_op.h" #include "core/context.h" #include "core/gen_case.h" #include "core/logging.h" @@ -22,53 +29,83 @@ #include "core/tensor.h" #include "core/type.h" #include "mlu_op_kernel.h" -#include "kernels/unary_op/unary_op_host.h" -using namespace std; -typedef uint16_t half; +#include "cnrt.h" +#include "cndev.h" +static inline bool isSupportType(const mluOpDataType_t check_type, + const mluOpDataType_t support_type[], + const int len) { + for (int i = 0; i < len; ++i) { + if (check_type == support_type[i]) { + return true; + } + } + return false; +} -mluOpStatus_t MLUOP_WIN_API mluOpMlNms(mluOpHandle_t handle, - const mluOpTensorDescriptor_t boxes_data_ptr_desc, void* boxes_data_ptr, void* scores_max_boxes_data_ptr, - int input_boxes_num, float iou_threshold, uint8_t* output_boxes_index) { - - int setoff; - bool zero_element = false; - mluOpDataType_t data_type = MLUOP_DTYPE_HALF; - mluOpDataType_t support_type[2] = {MLUOP_DTYPE_HALF, MLUOP_DTYPE_FLOAT}; +mluOpStatus_t MlNmsParamCheck( + const std::string &op_name, const mluOpHandle_t &handle, + const mluOpTensorDescriptor_t &x_desc, const void *x, + const mluOpDataType_t support_type[], const int &len) { + PARAM_CHECK(op_name, x_desc != NULL); + PARAM_CHECK(op_name, handle != NULL); + + // check data type + if (!isSupportType(x_desc->dtype, support_type, len)) { + LOG(ERROR) << op_name << ":x_desc's data type is not supported."; + return MLUOP_STATUS_BAD_PARAM; + } + PARAM_CHECK(op_name, x != NULL); + return MLUOP_STATUS_SUCCESS; +} - cnrtDim3_t k_dim = {4, 1, 1}; - cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1; - mluOpStatus_t param_check = unaryOpNmsParamCheck( - "[mluOpMlNms]", boxes_data_ptr_desc, boxes_data_ptr, scores_max_boxes_data_ptr, support_type, 2, zero_element); - if(param_check != MLUOP_STATUS_SUCCESS){ - return param_check; - } +static void policyFunc(const mluOpHandle_t &handle, + const mluOpTensorDescriptor_t desc, cnrtDim3_t *k_dim, + cnrtFunctionType_t *k_type) { + size_t dim = mluOpGetTensorElementNum(desc); + // Union1 policyFunc + *k_type = CNRT_FUNC_TYPE_UNION1; + k_dim->x = handle->core_num_per_cluster; + k_dim->y = mluop::runtime::getClusterLimitCapability(handle); + k_dim->z = 1; + // if a case is smaller than 2048 , it just need one cluster can work best. 
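+ // e.g. on a device with 4 cores per cluster and 8 clusters this yields k_dim = {4, 8, 1};
+ // the small-case fallback below then reduces it to {4, 1, 1}.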
+ size_t small_case_thread = 2048; + if (dim <= small_case_thread) k_dim->y = 1; +} - void (*mluOpFuncKernel)(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, mluOpDataType_t data_type, - void* boxes_data_ptr, void* scores_max_boxes_data_ptr, float nmsThres, int input_boxes_num, uint8_t* output_boxes_index); - //mluOpFuncKernel = NULL; - if (k_type == CNRT_FUNC_TYPE_BLOCK){ - if (boxes_data_ptr_desc->dtype == MLUOP_DTYPE_HALF){ - mluOpFuncKernel = mluBlockKernelMlNmsHalfFast; - }else { - mluOpFuncKernel = mluBlockKernelMlNmsFloatFast; - } - }else { - if (boxes_data_ptr_desc->dtype == MLUOP_DTYPE_HALF){ - mluOpFuncKernel = mluUnionKernelMlNmsHalfFast; - }else{ - mluOpFuncKernel = mluUnionKernelMlNmsFloatFast; - } - } +mluOpStatus_t MLUOP_WIN_API mluOpMlNms(mluOpHandle_t handle, + const mluOpTensorDescriptor_t boxes_data_ptr_desc, void* boxes_data_ptr, + float iou_threshold, void* output_boxes_index) { - KERNEL_CHECK( - (mluOpFuncKernel(k_dim, k_type, handle->queue, boxes_data_ptr_desc->dtype, boxes_data_ptr, scores_max_boxes_data_ptr, iou_threshold, input_boxes_num, output_boxes_index))); - GEN_CASE_END(); + mluOpDataType_t support_type[2] = {MLUOP_DTYPE_HALF, MLUOP_DTYPE_FLOAT}; + mluOpStatus_t param_check = MlNmsParamCheck( + "[mluOpMlNms]", handle, boxes_data_ptr_desc, boxes_data_ptr, + support_type, 2); - return MLUOP_STATUS_SUCCESS; + if (param_check != MLUOP_STATUS_SUCCESS) { + return param_check; + } -} + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; + policyFunc(handle, boxes_data_ptr_desc, &k_dim, &k_type); + int input_boxes_num = boxes_data_ptr_desc->total_element_num / 4; + void (*mluOpFuncKernel)(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, mluOpDataType_t data_type, void* boxes_data_ptr, + float nmsThres, int input_boxes_num, uint8_t* output_boxes_index); + if (boxes_data_ptr_desc->dtype == MLUOP_DTYPE_HALF) { + mluOpFuncKernel = mluOpKernelMlNmsHalfFast; + } else { + mluOpFuncKernel = mluOpKernelMlNmsFloatFast; + } + KERNEL_CHECK( + (mluOpFuncKernel(k_dim, k_type, handle->queue, + boxes_data_ptr_desc->dtype, boxes_data_ptr, + iou_threshold, input_boxes_num, (uint8_t*)output_boxes_index))); + GEN_CASE_END(); + return MLUOP_STATUS_SUCCESS; +} From 69570d71eb06fd6b3d6eae1ae9cb931c221a0e71 Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+Jones154@users.noreply.github.com> Date: Tue, 13 Dec 2022 14:49:19 +0800 Subject: [PATCH 13/36] Delete ml_nms_block.mlu --- bangc-ops/kernels/ml_nms/ml_nms_block.mlu | 166 ---------------------- 1 file changed, 166 deletions(-) delete mode 100644 bangc-ops/kernels/ml_nms/ml_nms_block.mlu diff --git a/bangc-ops/kernels/ml_nms/ml_nms_block.mlu b/bangc-ops/kernels/ml_nms/ml_nms_block.mlu deleted file mode 100644 index be80bd2fc..000000000 --- a/bangc-ops/kernels/ml_nms/ml_nms_block.mlu +++ /dev/null @@ -1,166 +0,0 @@ -#include "bang.h" -#include "mlu_op_kernel.h" -#include "kernels/unary_op/unary_op_block.h" - -#define MLU_VERSION 270 -#if MLU_VERSION == 220 -#undef MLU_VERSION -#define MLU_VERSION 220 -#elif MLU_VERSION == 270 -#undef MLU_VERSION -#define MLU_VERSION 270 -#elif MLU_VERSION == 290 -#undef MLU_VERSION -#define MLU_VERSION 290 -#elif MLU_VERSION == 322 -#undef MLU_VERSION -#define MLU_VERSION 322 -#elif MLU_VERSION == 372 -#undef MLU_VERSION -#define MLU_VERSION 372 -#endif - -__mlu_func__ void setSetoff(mluOpDataType_t data_type, int input_boxes_num, int min_cell, int* setoff){ - - switch (data_type){ - case MLUOP_DTYPE_HALF: - if((input_boxes_num % (min_cell / 2)) != 0){ - *setoff = (min_cell 
/ 2) * (input_boxes_num / (min_cell / 2) + 1); - }else{ - *setoff = (min_cell / 2) * (input_boxes_num / (min_cell / 2)); - } - case MLUOP_DTYPE_FLOAT: - if ((input_boxes_num % (min_cell / 4)) != 0){ - *setoff = (min_cell / 4) * (input_boxes_num / (min_cell / 4) + 1); - } else { - *setoff = (min_cell / 4) * (input_boxes_num / (min_cell / 4)); - } - default:break; - } -} - -__mlu_func__ void getOffsetNumMlNmsFast(int input_boxes_num, mluOpDataType_t data_type, int* setoff){ - - if (MLU_VERSION < 200){ - setSetoff(data_type, input_boxes_num, 64, setoff); - } else if (MLU_VERSION > 200 && MLU_VERSION < 300){ - setSetoff(data_type, input_boxes_num, 128, setoff); - } else if (MLU_VERSION > 300 && MLU_VERSION < 400){ - setSetoff(data_type, input_boxes_num, 128, setoff); - } -} - - - -template -__mlu_func__ void computeMlNmsFast(T* boxes_data_ptr, T* scores_max_boxes_data_ptr, T nms_thres, int setoff, int input_boxes_num, uint8_t* output_boxes_index){ - - __nram__ T scores_max_boxes[4]; - __nram__ T scores_max_boxes_area; - __nram__ T boxes_data[512]; - __nram__ T x1[512]; - __nram__ T y1[512]; - __nram__ T x2[512]; - __nram__ T y2[512]; - __nram__ T w[512]; - __nram__ T h[512]; - __nram__ T area_ptr[512]; - __nram__ T interarea_ptr[512]; - __nram__ T scores_max_boxes_area_ptr[512]; - __nram__ T nms_thresPtr[512]; - __nram__ T scores_max_boxes_ptr[512]; - __nram__ T tem[512]; - __nram__ uint8_t result[512]; - - __memcpy(boxes_data, boxes_data_ptr, input_boxes_num * 4 * sizeof(T), GDRAM2NRAM); - __memcpy(scores_max_boxes, scores_max_boxes_data_ptr, 4 * sizeof(T), GDRAM2NRAM); - - int j,i; - for(i = 0, j = 0; i < setoff; i++, j+=4){ - x1[i] = boxes_data[j + 0]; - y1[i] = boxes_data[j + 1]; - x2[i] = boxes_data[j + 2]; - y2[i] = boxes_data[j + 3]; - } - - - //-----------------iou detect-------------------- - - //fing all boxes area - __bang_sub(h, y1, y2, setoff); - __bang_sub(w, x2, x1, setoff); - __bang_mul(area_ptr, h, w, setoff); - - //max x1 - __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[0]); - __bang_cycle_sub(x1, x1, scores_max_boxes_ptr, setoff, setoff); - __bang_active_relu(x1, x1, setoff); - __bang_cycle_add(x1, x1, scores_max_boxes_ptr, setoff, setoff); - - //min y1 - __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[1]); - __bang_write_zero(tem, setoff); - __bang_cycle_add(tem, tem, scores_max_boxes_ptr, setoff, setoff); - __bang_sub(tem, y1, scores_max_boxes_ptr, setoff); - __bang_active_relu(tem, tem, setoff); - __bang_sub(y1, y1, tem, setoff); - - //min x2 - __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[2]); - __bang_write_zero(tem, setoff); - __bang_cycle_add(tem, tem, scores_max_boxes_ptr, setoff, setoff); - __bang_sub(tem, x2, scores_max_boxes_ptr, setoff); - __bang_active_relu(tem, tem, setoff); - __bang_sub(x2, x2, tem, setoff); - - //max y2 - __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[3]); - __bang_cycle_sub(y2, y2, scores_max_boxes_ptr, setoff, setoff); - __bang_active_relu(y2, y2, setoff); - __bang_cycle_add(y2, y2, scores_max_boxes_ptr, setoff, setoff); - - //--------- intesection------- - //fing W - __bang_sub(w, x2, x1, setoff); - __bang_active_relu(w, w, setoff); - - //find H - __bang_sub(h, y1, y2, setoff); - __bang_active_relu(h, h, setoff); - - //fing intersection - __bang_mul(interarea_ptr, h, w, setoff); - - //fing scores max boxes area - scores_max_boxes_area=(scores_max_boxes[1] - scores_max_boxes[3]) * (scores_max_boxes[2] - scores_max_boxes[0]); - 
__bang_write_value(scores_max_boxes_area_ptr, setoff, scores_max_boxes_area); - __bang_cycle_add(tem, area_ptr, scores_max_boxes_area_ptr, setoff, setoff); - __bang_sub(tem, tem, interarea_ptr, setoff); - __bang_write_value(nms_thresPtr, setoff, nms_thres); - __bang_cycle_mul(tem, tem, nms_thresPtr, setoff, setoff); - __bang_ge(tem, interarea_ptr, tem, setoff); - - for(int i =0; i < setoff; i++){ - result[i] = (int)(tem[i]); - } - - __memcpy(output_boxes_index, result, input_boxes_num * sizeof(uint8_t), NRAM2GDRAM); -} - - -UNION_OP_KERNEL_IMPLE(MlNms, float, Fast); -UNION_OP_KERNEL_IMPLE(MlNms, half, Fast); - -void MLUOP_WIN_API mluBlockKernelMlNmsFloatFast( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t data_type, void* boxes_data_ptr, void* scores_max_boxes_data_ptr, float nms_thres, int input_boxes_num, uint8_t* output_boxes_index){ - MLUBlockKernelMlNmsfloatFast<<>>( - data_type, boxes_data_ptr, scores_max_boxes_data_ptr, nms_thres, input_boxes_num, output_boxes_index); -} - -void MLUOP_WIN_API mluBlockKernelMlNmsHalfFast( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t data_type, void* boxes_data_ptr, void* scores_max_boxes_data_ptr, float nms_thres, int input_boxes_num, uint8_t* output_boxes_index){ - MLUBlockKernelMlNmshalfFast<<>>( - data_type, boxes_data_ptr, scores_max_boxes_data_ptr, nms_thres, input_boxes_num, output_boxes_index); -} From c6d90ac9d906304fa7e6c26b80d4adc9b7e0412a Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+Jones154@users.noreply.github.com> Date: Tue, 13 Dec 2022 14:49:29 +0800 Subject: [PATCH 14/36] Delete ml_nms_union.mlu --- bangc-ops/kernels/ml_nms/ml_nms_union.mlu | 201 ---------------------- 1 file changed, 201 deletions(-) delete mode 100644 bangc-ops/kernels/ml_nms/ml_nms_union.mlu diff --git a/bangc-ops/kernels/ml_nms/ml_nms_union.mlu b/bangc-ops/kernels/ml_nms/ml_nms_union.mlu deleted file mode 100644 index cc878d2e3..000000000 --- a/bangc-ops/kernels/ml_nms/ml_nms_union.mlu +++ /dev/null @@ -1,201 +0,0 @@ -#include "bang.h" -#include "mlu_op_kernel.h" -#include "kernels/unary_op/unary_op_union.h" - -#define MLU_VERSION 270 -#if MLU_VERSION == 220 -#undef MLU_VERSION -#define MLU_VERSION 220 -#elif MLU_VERSION == 270 -#undef MLU_VERSION -#define MLU_VERSION 270 -#elif MLU_VERSION == 290 -#undef MLU_VERSION -#define MLU_VERSION 290 -#elif MLU_VERSION == 322 -#undef MLU_VERSION -#define MLU_VERSION 322 -#elif MLU_VERSION == 372 -#undef MLU_VERSION -#define MLU_VERSION 372 -#endif - -__mlu_func__ void setSetoff(mluOpDataType_t data_type, int input_boxes_num, int min_cell, int cord_num, int* setoff){ - switch (data_type){ - case MLUOP_DTYPE_HALF: - if((input_boxes_num % (min_cell / 2 * cord_num)) != 0){ - *setoff = (min_cell / 2) * (input_boxes_num / (min_cell / 2 * cord_num) + 1); - }else{ - *setoff = (min_cell / 2) * (input_boxes_num / (min_cell / 2 * cord_num)); - } - case MLUOP_DTYPE_FLOAT: - if ((input_boxes_num % (min_cell / 4 * cord_num)) != 0){ - *setoff = (min_cell / 4) * (input_boxes_num / (min_cell / 4 * cord_num) + 1); - } else { - *setoff = (min_cell / 4) * (input_boxes_num / (min_cell / 4 * cord_num)); - } - default:break; - } -} - -__mlu_func__ void getOffsetNumMlNmsFast(int input_boxes_num, mluOpDataType_t data_type, int* setoff, cnrtFunctionType_t k_type){ - if(k_type == CNRT_FUNC_TYPE_UNION1){ - if (MLU_VERSION < 200){ - setSetoff(data_type, input_boxes_num, 64, 4, setoff); - } else if (MLU_VERSION > 200 && MLU_VERSION < 300){ - setSetoff(data_type, 
input_boxes_num, 128, 4, setoff); - } else if (MLU_VERSION > 300 && MLU_VERSION < 400){ - setSetoff(data_type, input_boxes_num, 128, 4, setoff); - } - } else if(k_type == CNRT_FUNC_TYPE_UNION2){ - if (MLU_VERSION < 200){ - setSetoff(data_type, input_boxes_num, 64, 8, setoff); - } else if (MLU_VERSION > 250 && MLU_VERSION < 300){ - setSetoff(data_type, input_boxes_num, 128, 8, setoff); - } else if (MLU_VERSION > 300 && MLU_VERSION < 400){ - setSetoff(data_type, input_boxes_num, 128, 8, setoff); - } - } else if(k_type == CNRT_FUNC_TYPE_UNION4){ - if (MLU_VERSION < 200){ - setSetoff(data_type, input_boxes_num, 64, 16, setoff); - } else if (MLU_VERSION > 250 && MLU_VERSION < 300){ - setSetoff(data_type, input_boxes_num, 128, 16, setoff); - } else if (MLU_VERSION > 300 && MLU_VERSION < 400){ - setSetoff(data_type, input_boxes_num, 128, 16, setoff); - } - } else if(k_type == CNRT_FUNC_TYPE_UNION8){ - if (MLU_VERSION < 200){ - setSetoff(data_type, input_boxes_num, 64, 32, setoff); - } - } -} - - - -template -__mlu_func__ void computeMlNmsFast(T* boxes_data_ptr, T* scores_max_boxes_data_ptr, T nms_thres, int setoff, int input_boxes_num, uint8_t* output_boxes_index){ - - __nram__ T scores_max_boxes[4]; - __nram__ T scores_max_boxes_area; - __nram__ T boxes_data[512]; - __nram__ T x1[512]; - __nram__ T y1[512]; - __nram__ T x2[512]; - __nram__ T y2[512]; - __nram__ T w[512]; - __nram__ T h[512]; - __nram__ T area_ptr[512]; - __nram__ T interarea_ptr[512]; - __nram__ T scores_max_boxes_area_ptr[512]; - __nram__ T nms_thres_ptr[512]; - __nram__ T scores_max_boxes_ptr[512]; - __nram__ T tem[512]; - __nram__ uint8_t result[512]; - - if (input_boxes_num % taskDim != 0){ - __memcpy(boxes_data, boxes_data_ptr + (taskId * (input_boxes_num / taskDim) * 4), (input_boxes_num / taskDim) * 4 * sizeof(T), GDRAM2NRAM); - } else { - if (taskId == (taskDim - 1)){ - __memcpy(boxes_data, boxes_data_ptr + (taskId * (input_boxes_num / taskDim) * 4), ((input_boxes_num / taskDim) + (input_boxes_num % taskDim)) * 4 * sizeof(T), GDRAM2NRAM); - } else { - __memcpy(boxes_data, boxes_data_ptr + (taskId * (input_boxes_num / taskDim) * 4), (input_boxes_num / taskDim) * 4 * sizeof(T), GDRAM2NRAM); - } - } - __memcpy(scores_max_boxes, scores_max_boxes_data_ptr, 4 * sizeof(T), GDRAM2NRAM); - - int j,i; - for(i = 0, j = 0; i < setoff; i++, j+=4){ - x1[i] = boxes_data[j + 0]; - y1[i] = boxes_data[j + 1]; - x2[i] = boxes_data[j + 2]; - y2[i] = boxes_data[j + 3]; - } - - //-----------------iou detect-------------------- - //fing all boxes area - __bang_sub(h, y1, y2, setoff); - __bang_sub(w, x2, x1, setoff); - __bang_mul(area_ptr, h, w, setoff); - - //max x1 - __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[0]); - __bang_cycle_sub(x1, x1, scores_max_boxes_ptr, setoff, setoff); - __bang_active_relu(x1, x1, setoff); - __bang_cycle_add(x1, x1, scores_max_boxes_ptr, setoff, setoff); - - //min y1 - __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[1]); - __bang_write_zero(tem, setoff); - __bang_cycle_add(tem, tem, scores_max_boxes_ptr, setoff, setoff); - __bang_sub(tem, y1, scores_max_boxes_ptr, setoff); - __bang_active_relu(tem, tem, setoff); - __bang_sub(y1, y1, tem, setoff); - - //min x2 - __bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[2]); - __bang_write_zero(tem, setoff); - __bang_cycle_add(tem, tem, scores_max_boxes_ptr, setoff, setoff); - __bang_sub(tem, x2, scores_max_boxes_ptr, setoff); - __bang_active_relu(tem, tem, setoff); - __bang_sub(x2, x2, tem, setoff); - - //max y2 - 
__bang_write_value(scores_max_boxes_ptr, setoff, scores_max_boxes[3]); - __bang_cycle_sub(y2, y2, scores_max_boxes_ptr, setoff, setoff); - __bang_active_relu(y2, y2, setoff); - __bang_cycle_add(y2, y2, scores_max_boxes_ptr, setoff, setoff); - - //--------- intesection------- - //fing W - __bang_sub(w, x2, x1, setoff); - __bang_active_relu(w, w, setoff); - - //find H - __bang_sub(h, y1, y2, setoff); - __bang_active_relu(h, h, setoff); - - //fing intersection - __bang_mul(interarea_ptr, h, w, setoff); - - //fing scores max boxes area - scores_max_boxes_area=(scores_max_boxes[1] - scores_max_boxes[3]) * (scores_max_boxes[2] - scores_max_boxes[0]); - __bang_write_value(scores_max_boxes_area_ptr, setoff, scores_max_boxes_area); - __bang_cycle_add(tem, area_ptr, scores_max_boxes_area_ptr, setoff, setoff); - __bang_sub(tem, tem, interarea_ptr, setoff); - __bang_write_value(nms_thres_ptr, setoff, nms_thres); - __bang_cycle_mul(tem, tem, nms_thres_ptr, setoff, setoff); - __bang_gt(tem, interarea_ptr, tem, setoff); - - for(int i =0; i < setoff; i++){ - result[i] = (int)(tem[i]); - } - - if (input_boxes_num % taskDim !=0){ - if (taskId == (taskDim - 1)){ - __memcpy(output_boxes_index + (taskId * (input_boxes_num / taskDim)), result, (input_boxes_num / taskDim + input_boxes_num % taskDim) * sizeof(uint8_t), NRAM2GDRAM); - } else { - __memcpy(output_boxes_index + (taskId * (input_boxes_num / taskDim)), result, (input_boxes_num / taskDim) * sizeof(uint8_t), NRAM2GDRAM); - } - } else { - __memcpy(output_boxes_index + (taskId * (input_boxes_num / taskDim)), result, (input_boxes_num / taskDim) * sizeof(uint8_t), NRAM2GDRAM); - } - -} - - -UNION_OP_KERNEL_IMPLE(MlNms, float, Fast); -UNION_OP_KERNEL_IMPLE(MlNms, half, Fast); - -void MLUOP_WIN_API mluUnionKernelMlNmsFloatFast( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t data_type, void* boxes_data_ptr, void* scores_max_boxes_data_ptr, float nms_thres, int input_boxes_num, uint8_t* output_boxes_index){ - MLUUnionKernelMlNmsfloatFast<<>>( - k_type, data_type, boxes_data_ptr, scores_max_boxes_data_ptr, nms_thres, input_boxes_num, output_boxes_index); -} - -void MLUOP_WIN_API mluUnionKernelMlNmsHalfFast( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t data_type, void* boxes_data_ptr, void* scores_max_boxes_data_ptr, float nms_thres, int input_boxes_num, uint8_t* output_boxes_index){ - MLUUnionKernelMlNmshalfFast<<>>( - k_type, data_type, boxes_data_ptr, scores_max_boxes_data_ptr, nms_thres, input_boxes_num, output_boxes_index); -} From ed65307b00b7d7430648c0e3d049b15ddbd6b552 Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+Jones154@users.noreply.github.com> Date: Tue, 13 Dec 2022 14:49:37 +0800 Subject: [PATCH 15/36] Delete test.txt --- bangc-ops/kernels/ml_nms/test.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 bangc-ops/kernels/ml_nms/test.txt diff --git a/bangc-ops/kernels/ml_nms/test.txt b/bangc-ops/kernels/ml_nms/test.txt deleted file mode 100644 index 8b1378917..000000000 --- a/bangc-ops/kernels/ml_nms/test.txt +++ /dev/null @@ -1 +0,0 @@ - From 394f998b4ee57cee2e88c6e700ff6a9d9823d125 Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+Jones154@users.noreply.github.com> Date: Tue, 13 Dec 2022 14:50:52 +0800 Subject: [PATCH 16/36] Create ml_nms.h --- bangc-ops/kernels/ml_nms/ml_nms.h | 53 +++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 bangc-ops/kernels/ml_nms/ml_nms.h diff --git a/bangc-ops/kernels/ml_nms/ml_nms.h 
b/bangc-ops/kernels/ml_nms/ml_nms.h new file mode 100644 index 000000000..2aa63f7e9 --- /dev/null +++ b/bangc-ops/kernels/ml_nms/ml_nms.h @@ -0,0 +1,53 @@ +/************************************************************************* +* Copyright (C) [2022] by Cambricon, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + ************************************************************************/ +#ifndef UNARY_OP_BLOCK_H_ +#define UNARY_OP_BLOCK_H_ +#include "kernels/kernel.h" +#define NRAM_SIZE 2 * 1024 +#define UNION_OP_KERNEL_DECLARE(Op, DType, Prefer) \ + __mlu_global__ void MLUBlockKernel##Op##DType##Prefer(\ + mluOpDataType_t data_type, void* boxes_data_ptr, \ + float nms_thres, int input_boxes_num, uint8_t* output_boxes_index);\ + +#define UNION_OP_KERNEL_IMPLE(Op, DType, Prefer) \ + __mlu_global__ void MLUOpKernel##Op##DType##Prefer( \ + mluOpDataType_t data_type, void* boxes_data_ptr, \ + float nms_thres, int input_boxes_num, uint8_t* output_boxes_index) {\ + int offset, seg; \ + getOffsetNum##Op##Prefer(input_boxes_num, &offset); \ + getSegNumMlNmsFast(input_boxes_num, &seg); \ + unionImple( \ + (DType*)boxes_data_ptr, (DType)nms_thres, \ + offset, seg, input_boxes_num, output_boxes_index);} + +template +__mlu_device__ void unionImple(T* boxes_data_ptr, T nms_thres, int offset, + int seg, int input_boxes_num, uint8_t* output_boxes_index) { + __nram__ char worke_space[MAX_NRAM_SIZE / 16]; + __memcpy((T*)worke_space, boxes_data_ptr + (offset * 4), seg * 4 * sizeof(T), GDRAM2NRAM); + __memcpy((T*)worke_space + (seg * 4), boxes_data_ptr, 4 * sizeof(T), GDRAM2NRAM); + OpFunc((T*)worke_space, nms_thres, input_boxes_num, offset, + seg, output_boxes_index); +} + +#endif From 01efa33cefa30c7bbf0a866192f3e5d6af10e162 Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+Jones154@users.noreply.github.com> Date: Tue, 13 Dec 2022 14:51:36 +0800 Subject: [PATCH 17/36] Create ml_nms.mlu --- bangc-ops/kernels/ml_nms/ml_nms.mlu | 197 ++++++++++++++++++++++++++++ 1 file changed, 197 insertions(+) create mode 100644 bangc-ops/kernels/ml_nms/ml_nms.mlu diff --git a/bangc-ops/kernels/ml_nms/ml_nms.mlu b/bangc-ops/kernels/ml_nms/ml_nms.mlu new file mode 100644 index 000000000..b9f2aca51 --- /dev/null +++ b/bangc-ops/kernels/ml_nms/ml_nms.mlu @@ -0,0 +1,197 @@ +/************************************************************************* +* Copyright (C) [2022] by Cambricon, Inc. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +************************************************************************/ +#include "bang.h" +#include "mlu_op_kernel.h" +#include "ml_nms.h" + +__mlu_func__ void getComputeLen(int seg, int elem_byte, int* compute_len) { +#if (__BANG_ARCH__ < 200) + *compute_len = (seg * elem_byte % 64) == 0 ? + seg : (seg * elem_byte / 64 + 1) * 64 / elem_byte; +#elif (__BANG_ARCH__ > 200 && __BANG_ARCH__ < 300) + *compute_len = (seg * elem_byte % 128) == 0 ? + seg : (seg * elem_byte / 128 + 1) * 128 / elem_byte; +#elif (__BANG_ARCH__ > 300) + *compute_len = seg; +#endif +} +__mlu_func__ void getOffsetNumMlNmsFast(int input_boxes_num, int* offset) { + if (taskDim > 1) { + *offset = (input_boxes_num % taskDim) > taskId ? 
+ (input_boxes_num / taskDim + 1) * taskId : + (input_boxes_num / taskDim) * taskId + (input_boxes_num % taskDim); + } else { + *offset = input_boxes_num; + } +} + +__mlu_func__ void getSegNumMlNmsFast(int input_boxes_num, int* seg) { + if (taskDim > 1) { + *seg = (input_boxes_num / taskDim) + + uint32_t((input_boxes_num % taskDim) > taskId); + } else { + *seg = input_boxes_num; + } +} + +template +__mlu_func__ void computeMlNmsFast(T* worke_space, + T nms_thres, int input_boxes_num, int offset, + int seg, uint8_t* output_boxes_index) { + __nram__ T scores_max_boxes_area; + __nram__ T w_s, h_s; + __nram__ T* scores_max_boxes; + __nram__ T* x1; + __nram__ T* y1; + __nram__ T* x2; + __nram__ T* y2; + __nram__ T* w; + __nram__ T* h; + __nram__ T* area_ptr; + __nram__ T* inter_area_ptr; + __nram__ T* scores_max_boxes_area_ptr; + __nram__ T* nms_thres_ptr; + __nram__ T* scores_max_boxes_ptr; + __nram__ T* tem; + __nram__ uint8_t* result; + int compute_len; + int i, j; + int data_len = seg * 4 + 4; + + getComputeLen(seg, sizeof(T), &compute_len); + scores_max_boxes = worke_space + (seg * 4); + x1 = worke_space + data_len; + y1 = worke_space + (data_len + compute_len); + x2 = worke_space + (data_len + (compute_len * 2)); + y2 = worke_space + (data_len + (compute_len * 3)); + + data_len = data_len + (compute_len * 4); + + for (i = 0, j = 0; i < seg * 4; i+=4, j++) { + x1[j] = worke_space[i]; + y1[j] = worke_space[i + 1]; + x2[j] = worke_space[i + 2]; + y2[j] = worke_space[i + 3]; + } + w = worke_space + data_len; + h = worke_space + (data_len + compute_len); + area_ptr = worke_space + (data_len + (compute_len * 2)); + inter_area_ptr = worke_space + (data_len + (compute_len * 3)); + scores_max_boxes_area_ptr = worke_space + (data_len + (compute_len * 4)); + nms_thres_ptr = worke_space + (data_len + (compute_len * 5)); + scores_max_boxes_ptr = worke_space + (data_len + (compute_len * 6)); + tem = worke_space + (data_len + (compute_len * 7)); + result = (uint8_t*)worke_space + (data_len + (compute_len * 8)); + + // -----------------iou detect-------------------- + // fing all boxes area + __bang_sub(h, y1, y2, compute_len); + __bang_sub(w, x2, x1, compute_len); + __bang_mul(area_ptr, h, w, compute_len); + + // max x1 + __bang_write_value(scores_max_boxes_ptr, compute_len, scores_max_boxes[0]); + __bang_cycle_sub(x1, x1, scores_max_boxes_ptr, compute_len, compute_len); + __bang_active_relu(x1, x1, compute_len); + __bang_cycle_add(x1, x1, scores_max_boxes_ptr, compute_len, compute_len); + + // min y1 + __bang_write_value(scores_max_boxes_ptr, compute_len, scores_max_boxes[1]); + __bang_write_zero(tem, compute_len); + __bang_cycle_add(tem, tem, scores_max_boxes_ptr, compute_len, compute_len); + __bang_sub(tem, y1, scores_max_boxes_ptr, compute_len); + __bang_active_relu(tem, tem, compute_len); + __bang_sub(y1, y1, tem, compute_len); + + // min x2 + __bang_write_value(scores_max_boxes_ptr, compute_len, scores_max_boxes[2]); + __bang_write_zero(tem, compute_len); + __bang_cycle_add(tem, tem, scores_max_boxes_ptr, compute_len, compute_len); + __bang_sub(tem, x2, scores_max_boxes_ptr, compute_len); + __bang_active_relu(tem, tem, compute_len); + __bang_sub(x2, x2, tem, compute_len); + + // max y2 + __bang_write_value(scores_max_boxes_ptr, compute_len, scores_max_boxes[3]); + __bang_cycle_sub(y2, y2, scores_max_boxes_ptr, compute_len, compute_len); + __bang_active_relu(y2, y2, compute_len); + __bang_cycle_add(y2, y2, scores_max_boxes_ptr, compute_len, compute_len); + + // --------- intesection------- + // 
find W
+ __bang_sub(w, x2, x1, compute_len);
+ __bang_active_relu(w, w, compute_len);
+
+ // find H
+ __bang_sub(h, y1, y2, compute_len);
+ __bang_active_relu(h, h, compute_len);
+
+ // find intersection
+ __bang_mul(inter_area_ptr, h, w, compute_len);
+
+ // find scores max boxes area
+ w_s = scores_max_boxes[2] - scores_max_boxes[0];
+ h_s = scores_max_boxes[1] - scores_max_boxes[3];
+ scores_max_boxes_area = w_s * h_s;
+
+ __bang_write_value(scores_max_boxes_area_ptr, compute_len,
+ scores_max_boxes_area);
+ __bang_cycle_add(tem, area_ptr, scores_max_boxes_area_ptr,
+ compute_len, compute_len);
+ __bang_sub(tem, tem, inter_area_ptr, compute_len);
+ __bang_write_value(nms_thres_ptr, compute_len, nms_thres);
+ __bang_cycle_mul(tem, tem, nms_thres_ptr, compute_len, compute_len);
+
+ __bang_le(tem, inter_area_ptr, tem, compute_len);
+
+ for (int i = 0; i < seg; i++) {
+ if (tem[i]) {
+ result[i] = 1;
+ } else {
+ result[i] = 0;
+ }
+ }
+ __memcpy(output_boxes_index + offset, result, seg * sizeof(uint8_t),
+ NRAM2GDRAM);
+}
+
+UNION_OP_KERNEL_IMPLE(MlNms, float, Fast);
+UNION_OP_KERNEL_IMPLE(MlNms, half, Fast);
+
+void MLUOP_WIN_API mluOpKernelMlNmsFloatFast(
+ cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
+ mluOpDataType_t data_type, void* boxes_data_ptr, float nms_thres,
+ int input_boxes_num, uint8_t* output_boxes_index) {
+ MLUOpKernelMlNmsfloatFast<<<k_dim, k_type, queue>>>(
+ data_type, boxes_data_ptr, nms_thres,
+ input_boxes_num, output_boxes_index);
+}
+
+void MLUOP_WIN_API mluOpKernelMlNmsHalfFast(
+ cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
+ mluOpDataType_t data_type, void* boxes_data_ptr, float nms_thres,
+ int input_boxes_num, uint8_t* output_boxes_index) {
+ MLUOpKernelMlNmshalfFast<<<k_dim, k_type, queue>>>(
+ data_type, boxes_data_ptr, nms_thres,
+ input_boxes_num, output_boxes_index);
+}

From 21cd9aaf5ad54458adfb0bedf8ac99f5f98be433 Mon Sep 17 00:00:00 2001
From: wenzhengyin <71548662+Jones154@users.noreply.github.com>
Date: Tue, 13 Dec 2022 14:53:38 +0800
Subject: [PATCH 18/36] Update mlu_op_test.proto

---
 .../pb_gtest/mlu_op_test_proto/mlu_op_test.proto | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/mlu_op_test_proto/mlu_op_test.proto b/bangc-ops/test/mlu_op_gtest/pb_gtest/mlu_op_test_proto/mlu_op_test.proto
index b90f02a0e..9714f6b04 100755
--- a/bangc-ops/test/mlu_op_gtest/pb_gtest/mlu_op_test_proto/mlu_op_test.proto
+++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/mlu_op_test_proto/mlu_op_test.proto
@@ -220,6 +220,7 @@ message Node {
 optional PsamaskForwardParam psamask_forward_param = 134658; // PsamaskForwardParam
 optional PsamaskBackwardParam psamask_backward_param = 134659; // PsamaskBackwardParam
 optional VoxelizationParam voxelization_param = 6564; // Voxelizationparam
+ optional MlNmsParam ml_nms_param = 4020; // MlNmsParam
 }


@@ -418,3 +419,8 @@ message VoxelizationParam {
 optional int32 ndim = 3 [default = 3];
 optional bool deterministic = 4 [default = true];
 }
+
+// param to call mluOpMlNms()
+message MlNmsParam {
+ required float iou_threshold = 1 [default = 0.2];
+}

From a355d0e32057612c9f7a95efa8fe69621a50db6f Mon Sep 17 00:00:00 2001
From: wenzhengyin <71548662+Jones154@users.noreply.github.com>
Date: Tue, 13 Dec 2022 14:56:07 +0800
Subject: [PATCH 19/36] Create ml_nms.cpp

---
 .../pb_gtest/src/zoo/ml_nms/ml_nms.cpp | 103 ++++++++++++++++++
 1 file changed, 103 insertions(+)
 create mode 100644 bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/ml_nms/ml_nms.cpp

diff --git 
a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/ml_nms/ml_nms.cpp b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/ml_nms/ml_nms.cpp new file mode 100644 index 000000000..c71d3b2f7 --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/ml_nms/ml_nms.cpp @@ -0,0 +1,103 @@ +/******************************************************************************* +* Copyright (C) [2022] by Cambricon, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*******************************************************************************/ +#include "ml_nms.h" +#include +#include + +namespace mluoptest { + +void MlNmsExecutor::paramCheck() { + if (!parser_->getProtoNode()->has_ml_nms_param()) { + LOG(ERROR) << "Lose ml_nms_param. "; + } + GTEST_CHECK(parser_->inputs().size() == 1, + "[MlNmsExecutor] input number is wrong. "); + GTEST_CHECK(parser_->outputs().size() == 1, + "[MlNmsExecutor] output number is wrong. 
"); +} + +void MlNmsExecutor::compute() { + float iou_threshold = + parser_->getProtoNode()->ml_nms_param().iou_threshold(); + VLOG(4) << "[mluMlNms] iou_threshold: " << iou_threshold; + // get tensor by name (in prototxt) + auto boxes_desc = parser_->getMetaTensor("input").tensor; + auto output_desc = parser_->getMetaTensor("output").tensor; + auto boxes_ptr = parser_->getMetaTensor("input").dev_ptr; + auto output_ptr = parser_->getMetaTensor("output").dev_ptr; + interface_timer_.start(); + + VLOG(4) << "[mluOpMlNms] call mluOpMlNms()"; + MLUOP_CHECK(mluOpMlNms(handle_, boxes_desc, boxes_ptr, + iou_threshold, (uint8_t*)output_ptr)); + interface_timer_.stop(); + VLOG(4) << "[mluOpMlNms] mluOpMlNms end."; +} + +static float iouCompute(std::vector box1, std::vector box2) { + float x1 = std::max(box1[0], box2[0]); + float y1 = std::min(box1[1], box2[1]); + float x2 = std::min(box1[2], box2[2]); + float y2 = std::max(box1[3], box2[3]); + + float area1 = abs(box1[0] - box1[2]) * abs(box1[1] - box1[3]); + float area2 = abs(box2[0] - box2[2]) * abs(box2[1] - box2[3]); + float inter = abs(x1 - x2) * abs(y1 - y2); + + float iou = inter / (area1 + area2 - inter); + + return iou; +} + +void MlNmsExecutor::cpuCompute() { + float iou_threshold = + parser_->getProtoNode()->ml_nms_param().iou_threshold(); + VLOG(4) << "mluMlNms iou_threshold:" << iou_threshold; + auto input_desc = tensor_desc_[0].tensor; + auto boxes_ptr = parser_->getMetaTensor(0).cpu_ptr; + auto output_ptr = parser_->getMetaTensor(1).cpu_ptr; + int input_boxes_num = input_desc->dims[0]; + std::vector> boxes_data_ptr; + for (int i = 0; i < input_boxes_num * 4; i+=4) { + std::vector data_ptr; + for (int j = 0; j < 4; j++) { + data_ptr.push_back(boxes_ptr[j + i]); + } + boxes_data_ptr.push_back(data_ptr); + } + for (int i = 0; i < input_boxes_num ; i++) { + float iou = iouCompute(boxes_data_ptr[0], boxes_data_ptr[i]); + if (iou <= iou_threshold) { + output_ptr[i] = 1; + } else { + output_ptr[i] = 0; + } + } +} + +int64_t MlNmsExecutor::getTheoryOps() { + int64_t theory_ops = parser_->input(0)->total_count; + VLOG(4) << "getTheoryOps: " << theory_ops << " ops"; + return theory_ops; +} +} // namespace mluoptest From 6cef39d489244a69fa9e0d6fcf3aac782c95d68c Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+Jones154@users.noreply.github.com> Date: Tue, 13 Dec 2022 15:05:06 +0800 Subject: [PATCH 20/36] Create ml_nms.h --- .../pb_gtest/src/zoo/ml_nms/ml_nms.h | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/ml_nms/ml_nms.h diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/ml_nms/ml_nms.h b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/ml_nms/ml_nms.h new file mode 100644 index 000000000..57ac0193b --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/ml_nms/ml_nms.h @@ -0,0 +1,41 @@ +/************************************************************************* + * Copyright (C) [2022] by Cambricon, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#ifndef TEST_MLU_OP_SRC_ZOO_ML_NMS_ML_NMS_H_ +#define TEST_MLU_OP_SRC_ZOO_ML_NMS_ML_NMS_H_ +#include "executor.h" + +namespace mluoptest { + +class MlNmsExecutor : public Executor { + public: + MlNmsExecutor() {} + ~MlNmsExecutor() {} + + void paramCheck(); + void compute(); + void cpuCompute(); + int64_t getTheoryOps() override; +}; + +} // namespace mluoptest +#endif // TEST_MLU_OP_SRC_ZOO_ML_NMS_ML_NMS_H_ From 565840de5ee8f51336ac0a6a9f9abb5c5812782c Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+Jones154@users.noreply.github.com> Date: Tue, 13 Dec 2022 15:08:32 +0800 Subject: [PATCH 21/36] Create case_0.prototxt --- .../src/zoo/ml_nms/test_case/case_0.prototxt | 286 ++++++++++++++++++ 1 file changed, 286 insertions(+) create mode 100644 bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/ml_nms/test_case/case_0.prototxt diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/ml_nms/test_case/case_0.prototxt b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/ml_nms/test_case/case_0.prototxt new file mode 100644 index 000000000..616aa59dd --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/ml_nms/test_case/case_0.prototxt @@ -0,0 +1,286 @@ +op_name: "ml_nms" +input { + id: "input" + shape { + dims: 64 + dims: 4 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + value_f: 100.00 + value_f: 200.00 + value_f: 200.00 + value_f: 100.00 + value_f: 105.00 + value_f: 200.00 + value_f: 205.00 + value_f: 100.00 + value_f: 110.00 + value_f: 200.00 + value_f: 210.00 + value_f: 100.00 + value_f: 115.00 + value_f: 200.00 + value_f: 215.00 + value_f: 100.00 + value_f: 120.00 + value_f: 200.00 + value_f: 220.00 + value_f: 100.00 + value_f: 125.00 + value_f: 200.00 + value_f: 225.00 + value_f: 100.00 + value_f: 130.00 + value_f: 200.00 + value_f: 230.00 + value_f: 100.00 + value_f: 135.00 + value_f: 200.00 + value_f: 235.00 + value_f: 100.00 + value_f: 140.00 + value_f: 200.00 + value_f: 240.00 + value_f: 100.00 + value_f: 145.00 + value_f: 200.00 + value_f: 245.00 + value_f: 100.00 + value_f: 150.00 + value_f: 200.00 + value_f: 250.00 + value_f: 100.00 + value_f: 155.00 + value_f: 200.00 + value_f: 255.00 + value_f: 100.00 + value_f: 160.00 + value_f: 200.00 + value_f: 260.00 + value_f: 100.00 + value_f: 165.00 + value_f: 200.00 + value_f: 265.00 + value_f: 100.00 + value_f: 170.00 + value_f: 200.00 + value_f: 270.00 + value_f: 100.00 + 
value_f: 175.00 + value_f: 200.00 + value_f: 275.00 + value_f: 100.00 + value_f: 180.00 + value_f: 200.00 + value_f: 280.00 + value_f: 100.00 + value_f: 185.00 + value_f: 200.00 + value_f: 285.00 + value_f: 100.00 + value_f: 190.00 + value_f: 200.00 + value_f: 290.00 + value_f: 100.00 + value_f: 195.00 + value_f: 200.00 + value_f: 295.00 + value_f: 100.00 + value_f: 100.00 + value_f: 205.00 + value_f: 200.00 + value_f: 105.00 + value_f: 100.00 + value_f: 210.00 + value_f: 200.00 + value_f: 110.00 + value_f: 100.00 + value_f: 215.00 + value_f: 200.00 + value_f: 115.00 + value_f: 100.00 + value_f: 220.00 + value_f: 200.00 + value_f: 120.00 + value_f: 100.00 + value_f: 225.00 + value_f: 200.00 + value_f: 125.00 + value_f: 100.00 + value_f: 230.00 + value_f: 200.00 + value_f: 130.00 + value_f: 100.00 + value_f: 235.00 + value_f: 200.00 + value_f: 135.00 + value_f: 100.00 + value_f: 240.00 + value_f: 200.00 + value_f: 140.00 + value_f: 100.00 + value_f: 245.00 + value_f: 200.00 + value_f: 145.00 + value_f: 100.00 + value_f: 250.00 + value_f: 200.00 + value_f: 150.00 + value_f: 100.00 + value_f: 255.00 + value_f: 200.00 + value_f: 155.00 + value_f: 100.00 + value_f: 260.00 + value_f: 200.00 + value_f: 160.00 + value_f: 100.00 + value_f: 265.00 + value_f: 200.00 + value_f: 165.00 + value_f: 100.00 + value_f: 270.00 + value_f: 200.00 + value_f: 170.00 + value_f: 100.00 + value_f: 275.00 + value_f: 200.00 + value_f: 175.00 + value_f: 100.00 + value_f: 280.00 + value_f: 200.00 + value_f: 180.00 + value_f: 100.00 + value_f: 285.00 + value_f: 200.00 + value_f: 185.00 + value_f: 100.00 + value_f: 290.00 + value_f: 200.00 + value_f: 190.00 + value_f: 100.00 + value_f: 295.00 + value_f: 200.00 + value_f: 195.00 + value_f: 100.00 + value_f: 300.00 + value_f: 200.00 + value_f: 200.00 + value_f: 105.00 + value_f: 195.00 + value_f: 205.00 + value_f: 95.00 + value_f: 110.00 + value_f: 190.00 + value_f: 210.00 + value_f: 90.00 + value_f: 115.00 + value_f: 185.00 + value_f: 215.00 + value_f: 85.00 + value_f: 120.00 + value_f: 180.00 + value_f: 220.00 + value_f: 80.00 + value_f: 125.00 + value_f: 175.00 + value_f: 225.00 + value_f: 75.00 + value_f: 130.00 + value_f: 170.00 + value_f: 230.00 + value_f: 70.00 + value_f: 135.00 + value_f: 165.00 + value_f: 235.00 + value_f: 65.00 + value_f: 140.00 + value_f: 160.00 + value_f: 240.00 + value_f: 60.00 + value_f: 145.00 + value_f: 155.00 + value_f: 245.00 + value_f: 55.00 + value_f: 150.00 + value_f: 150.00 + value_f: 250.00 + value_f: 50.00 + value_f: 155.00 + value_f: 145.00 + value_f: 255.00 + value_f: 45.00 + value_f: 160.00 + value_f: 140.00 + value_f: 260.00 + value_f: 40.00 + value_f: 165.00 + value_f: 135.00 + value_f: 265.00 + value_f: 35.00 + value_f: 170.00 + value_f: 130.00 + value_f: 270.00 + value_f: 30.00 + value_f: 175.00 + value_f: 125.00 + value_f: 275.00 + value_f: 25.00 + value_f: 180.00 + value_f: 120.00 + value_f: 280.00 + value_f: 20.00 + value_f: 185.00 + value_f: 115.00 + value_f: 285.00 + value_f: 15.00 + value_f: 190.00 + value_f: 110.00 + value_f: 290.00 + value_f: 10.00 + value_f: 195.00 + value_f: 105.00 + value_f: 295.00 + value_f: 5.00 + value_f: 200.00 + value_f: 100.00 + value_f: 300.00 + value_f: 0.00 + value_f: 101.00 + value_f: 200.00 + value_f: 201.00 + value_f: 100.00 + value_f: 108.00 + value_f: 200.00 + value_f: 208.00 + value_f: 100.00 + value_f: 127.00 + value_f: 200.00 + value_f: 227.00 + value_f: 100.00 + value_f: 137.00 + value_f: 200.00 + value_f: 237.00 + value_f: 100.00 +} +output { + id: "output" + shape: { + dims: 64 
+ } + layout: LAYOUT_ARRAY + dtype: DTYPE_INT8 +} +ml_nms_param: { + iou_threshold: 0.5 +} +test_param: { + error_func: DIFF1 + error_func: DIFF2 + error_func: DIFF3 + error_threshold: 0.0 + error_threshold: 0.0 + error_threshold: 0.0 + baseline_device: CPU +} From f54a8b678df301dae609e340ce3305bf4f305dc7 Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+Jones154@users.noreply.github.com> Date: Tue, 13 Dec 2022 15:51:56 +0800 Subject: [PATCH 22/36] Update ml_nms.cpp --- bangc-ops/kernels/ml_nms/ml_nms.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/bangc-ops/kernels/ml_nms/ml_nms.cpp b/bangc-ops/kernels/ml_nms/ml_nms.cpp index 467747ac9..dba3cd107 100644 --- a/bangc-ops/kernels/ml_nms/ml_nms.cpp +++ b/bangc-ops/kernels/ml_nms/ml_nms.cpp @@ -31,6 +31,7 @@ #include "mlu_op_kernel.h" #include "cnrt.h" #include "cndev.h" +#include "ml_nms.h" static inline bool isSupportType(const mluOpDataType_t check_type, const mluOpDataType_t support_type[], From 7ba8626275e04462de10e1af163f321c129b843f Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+Jones154@users.noreply.github.com> Date: Tue, 13 Dec 2022 15:53:43 +0800 Subject: [PATCH 23/36] Update ml_nms.cpp --- bangc-ops/kernels/ml_nms/ml_nms.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bangc-ops/kernels/ml_nms/ml_nms.cpp b/bangc-ops/kernels/ml_nms/ml_nms.cpp index dba3cd107..8fc595f53 100644 --- a/bangc-ops/kernels/ml_nms/ml_nms.cpp +++ b/bangc-ops/kernels/ml_nms/ml_nms.cpp @@ -45,9 +45,9 @@ static inline bool isSupportType(const mluOpDataType_t check_type, } mluOpStatus_t MlNmsParamCheck( - const std::string &op_name, const mluOpHandle_t &handle, - const mluOpTensorDescriptor_t &x_desc, const void *x, - const mluOpDataType_t support_type[], const int &len) { + const std::string &op_name, const mluOpHandle_t &handle, + const mluOpTensorDescriptor_t &x_desc, const void *x, + const mluOpDataType_t support_type[], const int &len) { PARAM_CHECK(op_name, x_desc != NULL); PARAM_CHECK(op_name, handle != NULL); From 90fa38e9e4eb17b2de25774425991d7b5b8870be Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+Jones154@users.noreply.github.com> Date: Tue, 13 Dec 2022 16:06:21 +0800 Subject: [PATCH 24/36] Update ml_nms.cpp --- bangc-ops/kernels/ml_nms/ml_nms.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/bangc-ops/kernels/ml_nms/ml_nms.cpp b/bangc-ops/kernels/ml_nms/ml_nms.cpp index 8fc595f53..094a61748 100644 --- a/bangc-ops/kernels/ml_nms/ml_nms.cpp +++ b/bangc-ops/kernels/ml_nms/ml_nms.cpp @@ -21,6 +21,7 @@ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
************************************************************************/ #include +#include #include "mlu_op.h" #include "core/context.h" #include "core/gen_case.h" From 32191ddfb5c06cee6841339915658d5f91e3f6c4 Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+Jones154@users.noreply.github.com> Date: Tue, 13 Dec 2022 16:13:05 +0800 Subject: [PATCH 25/36] Update mlu_op.h --- bangc-ops/mlu_op.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bangc-ops/mlu_op.h b/bangc-ops/mlu_op.h index 342ede63b..a91f9204d 100755 --- a/bangc-ops/mlu_op.h +++ b/bangc-ops/mlu_op.h @@ -1484,10 +1484,10 @@ mluOpGetTensorAndDataFromTensorSet(mluOpTensorSetDescriptor_t tensorSetDesc, mluOpStatus_t MLUOP_WIN_API mluOpMlNms(mluOpHandle_t handle, - const mluOpTensorDescriptor_t boxes_data_ptr_desc, - void* boxes_data_ptr, - float iou_threshold, - void* output_boxes_index); + const mluOpTensorDescriptor_t boxes_data_ptr_desc, + void* boxes_data_ptr, + float iou_threshold, + void* output_boxes_index); // Group:Abs /*! * @brief Computes the absolute value for every element of the input tensor \b x From d5d6be6c5ff2a4fd5ecc1951bb81fc5483bd002e Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+Jones154@users.noreply.github.com> Date: Tue, 13 Dec 2022 16:15:38 +0800 Subject: [PATCH 26/36] Update ml_nms.h --- bangc-ops/kernels/ml_nms/ml_nms.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/bangc-ops/kernels/ml_nms/ml_nms.h b/bangc-ops/kernels/ml_nms/ml_nms.h index 2aa63f7e9..92b01962c 100644 --- a/bangc-ops/kernels/ml_nms/ml_nms.h +++ b/bangc-ops/kernels/ml_nms/ml_nms.h @@ -44,8 +44,14 @@ template __mlu_device__ void unionImple(T* boxes_data_ptr, T nms_thres, int offset, int seg, int input_boxes_num, uint8_t* output_boxes_index) { __nram__ char worke_space[MAX_NRAM_SIZE / 16]; - __memcpy((T*)worke_space, boxes_data_ptr + (offset * 4), seg * 4 * sizeof(T), GDRAM2NRAM); - __memcpy((T*)worke_space + (seg * 4), boxes_data_ptr, 4 * sizeof(T), GDRAM2NRAM); + __memcpy((T*)worke_space, + boxes_data_ptr + (offset * 4), + seg * 4 * sizeof(T), + GDRAM2NRAM); + __memcpy((T*)worke_space + (seg * 4), + boxes_data_ptr, + 4 * sizeof(T), + GDRAM2NRAM); OpFunc((T*)worke_space, nms_thres, input_boxes_num, offset, seg, output_boxes_index); } From b826fd27ebfa0ccb3aa5e8b3603aea15842bcaa5 Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+Jones154@users.noreply.github.com> Date: Tue, 13 Dec 2022 16:19:26 +0800 Subject: [PATCH 27/36] Update mlu_op.h --- bangc-ops/mlu_op.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bangc-ops/mlu_op.h b/bangc-ops/mlu_op.h index a91f9204d..841e36edd 100755 --- a/bangc-ops/mlu_op.h +++ b/bangc-ops/mlu_op.h @@ -1484,10 +1484,10 @@ mluOpGetTensorAndDataFromTensorSet(mluOpTensorSetDescriptor_t tensorSetDesc, mluOpStatus_t MLUOP_WIN_API mluOpMlNms(mluOpHandle_t handle, - const mluOpTensorDescriptor_t boxes_data_ptr_desc, - void* boxes_data_ptr, - float iou_threshold, - void* output_boxes_index); + const mluOpTensorDescriptor_t boxes_data_ptr_desc, + void* boxes_data_ptr, + float iou_threshold, + void* output_boxes_index); // Group:Abs /*! 
* @brief Computes the absolute value for every element of the input tensor \b x From d5ade705509923ba98ce40923eed2c0ae3b0b60c Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+Jones154@users.noreply.github.com> Date: Tue, 13 Dec 2022 16:32:44 +0800 Subject: [PATCH 28/36] Create ml_nms.cpp --- bangc-ops/kernels/ml_nms/ml_nms.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/bangc-ops/kernels/ml_nms/ml_nms.cpp b/bangc-ops/kernels/ml_nms/ml_nms.cpp index 094a61748..8dd7fe221 100644 --- a/bangc-ops/kernels/ml_nms/ml_nms.cpp +++ b/bangc-ops/kernels/ml_nms/ml_nms.cpp @@ -32,7 +32,6 @@ #include "mlu_op_kernel.h" #include "cnrt.h" #include "cndev.h" -#include "ml_nms.h" static inline bool isSupportType(const mluOpDataType_t check_type, const mluOpDataType_t support_type[], From bffdca67f3b5f501dc8964809e27571387ffce34 Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+Jones154@users.noreply.github.com> Date: Tue, 13 Dec 2022 16:35:23 +0800 Subject: [PATCH 29/36] Update ml_nms.cpp --- bangc-ops/kernels/ml_nms/ml_nms.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bangc-ops/kernels/ml_nms/ml_nms.cpp b/bangc-ops/kernels/ml_nms/ml_nms.cpp index 8dd7fe221..26a1b6f1b 100644 --- a/bangc-ops/kernels/ml_nms/ml_nms.cpp +++ b/bangc-ops/kernels/ml_nms/ml_nms.cpp @@ -22,7 +22,6 @@ ************************************************************************/ #include #include -#include "mlu_op.h" #include "core/context.h" #include "core/gen_case.h" #include "core/logging.h" @@ -30,6 +29,8 @@ #include "core/tensor.h" #include "core/type.h" #include "mlu_op_kernel.h" +#include "mlu_op.h" +#include "ml_nms.h" #include "cnrt.h" #include "cndev.h" From 445333db94ef2c26329037ddd774eb4456906980 Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+Jones154@users.noreply.github.com> Date: Tue, 13 Dec 2022 16:56:13 +0800 Subject: [PATCH 30/36] Update ml_nms.cpp --- bangc-ops/kernels/ml_nms/ml_nms.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/bangc-ops/kernels/ml_nms/ml_nms.cpp b/bangc-ops/kernels/ml_nms/ml_nms.cpp index 26a1b6f1b..41f1a8b5e 100644 --- a/bangc-ops/kernels/ml_nms/ml_nms.cpp +++ b/bangc-ops/kernels/ml_nms/ml_nms.cpp @@ -30,7 +30,6 @@ #include "core/type.h" #include "mlu_op_kernel.h" #include "mlu_op.h" -#include "ml_nms.h" #include "cnrt.h" #include "cndev.h" From be55b037e2a7ec07d6d75ba46b3d1eab70520d69 Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+Jones154@users.noreply.github.com> Date: Tue, 13 Dec 2022 17:08:10 +0800 Subject: [PATCH 31/36] Update ml_nms.mlu --- bangc-ops/kernels/ml_nms/ml_nms.mlu | 35 ++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/bangc-ops/kernels/ml_nms/ml_nms.mlu b/bangc-ops/kernels/ml_nms/ml_nms.mlu index b9f2aca51..996d96347 100644 --- a/bangc-ops/kernels/ml_nms/ml_nms.mlu +++ b/bangc-ops/kernels/ml_nms/ml_nms.mlu @@ -22,7 +22,40 @@ ************************************************************************/ #include "bang.h" #include "mlu_op_kernel.h" -#include "ml_nms.h" +#include "kernels/kernel.h" + +#define NRAM_SIZE 2 * 1024 +#define UNION_OP_KERNEL_DECLARE(Op, DType, Prefer) \ + __mlu_global__ void MLUBlockKernel##Op##DType##Prefer(\ + mluOpDataType_t data_type, void* boxes_data_ptr, \ + float nms_thres, int input_boxes_num, uint8_t* output_boxes_index);\ + +#define UNION_OP_KERNEL_IMPLE(Op, DType, Prefer) \ + __mlu_global__ void MLUOpKernel##Op##DType##Prefer( \ + mluOpDataType_t data_type, void* boxes_data_ptr, \ + float nms_thres, int input_boxes_num, uint8_t* output_boxes_index) {\ + int 
offset, seg; \ + getOffsetNum##Op##Prefer(input_boxes_num, &offset); \ + getSegNumMlNmsFast(input_boxes_num, &seg); \ + unionImple( \ + (DType*)boxes_data_ptr, (DType)nms_thres, \ + offset, seg, input_boxes_num, output_boxes_index);} + +template +__mlu_device__ void unionImple(T* boxes_data_ptr, T nms_thres, int offset, + int seg, int input_boxes_num, uint8_t* output_boxes_index) { + __nram__ char worke_space[MAX_NRAM_SIZE / 16]; + __memcpy((T*)worke_space, + boxes_data_ptr + (offset * 4), + seg * 4 * sizeof(T), + GDRAM2NRAM); + __memcpy((T*)worke_space + (seg * 4), + boxes_data_ptr, + 4 * sizeof(T), + GDRAM2NRAM); + OpFunc((T*)worke_space, nms_thres, input_boxes_num, offset, + seg, output_boxes_index); +} __mlu_func__ void getComputeLen(int seg, int elem_byte, int* compute_len) { #if (__BANG_ARCH__ < 200) From 9436e06035ab3849f867c97c176b049b0086c87b Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+Jones154@users.noreply.github.com> Date: Tue, 13 Dec 2022 17:08:24 +0800 Subject: [PATCH 32/36] Delete ml_nms.h --- bangc-ops/kernels/ml_nms/ml_nms.h | 59 ------------------------------- 1 file changed, 59 deletions(-) delete mode 100644 bangc-ops/kernels/ml_nms/ml_nms.h diff --git a/bangc-ops/kernels/ml_nms/ml_nms.h b/bangc-ops/kernels/ml_nms/ml_nms.h deleted file mode 100644 index 92b01962c..000000000 --- a/bangc-ops/kernels/ml_nms/ml_nms.h +++ /dev/null @@ -1,59 +0,0 @@ -/************************************************************************* -* Copyright (C) [2022] by Cambricon, Inc. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the -* "Software"), to deal in the Software without restriction, including -* without limitation the rights to use, copy, modify, merge, publish, -* distribute, sublicense, and/or sell copies of the Software, and to -* permit persons to whom the Software is furnished to do so, subject to -* the following conditions: -* -* The above copyright notice and this permission notice shall be included -* in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- ************************************************************************/ -#ifndef UNARY_OP_BLOCK_H_ -#define UNARY_OP_BLOCK_H_ -#include "kernels/kernel.h" -#define NRAM_SIZE 2 * 1024 -#define UNION_OP_KERNEL_DECLARE(Op, DType, Prefer) \ - __mlu_global__ void MLUBlockKernel##Op##DType##Prefer(\ - mluOpDataType_t data_type, void* boxes_data_ptr, \ - float nms_thres, int input_boxes_num, uint8_t* output_boxes_index);\ - -#define UNION_OP_KERNEL_IMPLE(Op, DType, Prefer) \ - __mlu_global__ void MLUOpKernel##Op##DType##Prefer( \ - mluOpDataType_t data_type, void* boxes_data_ptr, \ - float nms_thres, int input_boxes_num, uint8_t* output_boxes_index) {\ - int offset, seg; \ - getOffsetNum##Op##Prefer(input_boxes_num, &offset); \ - getSegNumMlNmsFast(input_boxes_num, &seg); \ - unionImple( \ - (DType*)boxes_data_ptr, (DType)nms_thres, \ - offset, seg, input_boxes_num, output_boxes_index);} - -template -__mlu_device__ void unionImple(T* boxes_data_ptr, T nms_thres, int offset, - int seg, int input_boxes_num, uint8_t* output_boxes_index) { - __nram__ char worke_space[MAX_NRAM_SIZE / 16]; - __memcpy((T*)worke_space, - boxes_data_ptr + (offset * 4), - seg * 4 * sizeof(T), - GDRAM2NRAM); - __memcpy((T*)worke_space + (seg * 4), - boxes_data_ptr, - 4 * sizeof(T), - GDRAM2NRAM); - OpFunc((T*)worke_space, nms_thres, input_boxes_num, offset, - seg, output_boxes_index); -} - -#endif From 14762969b9a8ae6488aeef732ef143c27935ac85 Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+wenzhengyin@users.noreply.github.com> Date: Fri, 3 Feb 2023 14:02:24 +0800 Subject: [PATCH 33/36] Update mlu_op_kernel.h --- bangc-ops/mlu_op_kernel.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bangc-ops/mlu_op_kernel.h b/bangc-ops/mlu_op_kernel.h index d5ec76aa6..58be8e84c 100644 --- a/bangc-ops/mlu_op_kernel.h +++ b/bangc-ops/mlu_op_kernel.h @@ -43,12 +43,12 @@ extern "C" { void MLUOP_WIN_API mluOpKernelMlNmsFloatFast(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, mluOpDataType_t data_type, void* boxes_data_ptr, float nms_thres, int input_boxes_num, - uint8_t* output_boxes_index); + int boxes_start_position, uint8_t* output_boxes_index); void MLUOP_WIN_API mluOpKernelMlNmsHalfFast(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, mluOpDataType_t data_type, void* boxes_data_ptr, float nms_thres, int input_boxes_num, - uint8_t* output_boxes_index); + int boxes_start_position, uint8_t* output_boxes_index); /* Abs */ void MLUOP_WIN_API mluOpBlockKernel3StagePipelineAbsHalfFast( From bb457d78f4657616fa549bcfaa227602da0bedf4 Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+wenzhengyin@users.noreply.github.com> Date: Fri, 3 Feb 2023 14:04:32 +0800 Subject: [PATCH 34/36] Update ml_nms.cpp --- bangc-ops/kernels/ml_nms/ml_nms.cpp | 51 ++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 11 deletions(-) diff --git a/bangc-ops/kernels/ml_nms/ml_nms.cpp b/bangc-ops/kernels/ml_nms/ml_nms.cpp index 41f1a8b5e..132840513 100644 --- a/bangc-ops/kernels/ml_nms/ml_nms.cpp +++ b/bangc-ops/kernels/ml_nms/ml_nms.cpp @@ -28,6 +28,7 @@ #include "core/runtime/device.h" #include "core/tensor.h" #include "core/type.h" +#include "kernels/kernel.h" #include "mlu_op_kernel.h" #include "mlu_op.h" #include "cnrt.h" @@ -91,21 +92,49 @@ mluOpStatus_t MLUOP_WIN_API mluOpMlNms(mluOpHandle_t handle, cnrtDim3_t k_dim; cnrtFunctionType_t k_type; policyFunc(handle, boxes_data_ptr_desc, &k_dim, &k_type); - int input_boxes_num = boxes_data_ptr_desc->total_element_num / 4; + int input_boxes_num = 
boxes_data_ptr_desc->total_element_num / 6; + int apply_nram_size = 0; + int boxes_start_position = 0; + int loop_num = 0; void (*mluOpFuncKernel)(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, mluOpDataType_t data_type, void* boxes_data_ptr, - float nmsThres, int input_boxes_num, uint8_t* output_boxes_index); + float nmsThres, int input_boxes_num, int boxes_start_position, + uint8_t* output_boxes_index); - if (boxes_data_ptr_desc->dtype == MLUOP_DTYPE_HALF) { - mluOpFuncKernel = mluOpKernelMlNmsHalfFast; - } else { - mluOpFuncKernel = mluOpKernelMlNmsFloatFast; + if (boxes_data_ptr_desc->dtype == MLUOP_DTYPE_HALF) { + mluOpFuncKernel = mluOpKernelMlNmsHalfFast; + apply_nram_size = (input_boxes_num * 6 * 2) + (input_boxes_num * 14 * 2); + } else { + mluOpFuncKernel = mluOpKernelMlNmsFloatFast; + apply_nram_size = (input_boxes_num * 6 * 4) + (input_boxes_num * 14 * 4); + } + if (apply_nram_size > MAX_NRAM_SIZE) { + if ((apply_nram_size % MAX_NRAM_SIZE) !=0) { + loop_num = (apply_nram_size / MAX_NRAM_SIZE) + 1; + } else { + loop_num = apply_nram_size / MAX_NRAM_SIZE; } - - KERNEL_CHECK( - (mluOpFuncKernel(k_dim, k_type, handle->queue, - boxes_data_ptr_desc->dtype, boxes_data_ptr, - iou_threshold, input_boxes_num, (uint8_t*)output_boxes_index))); + } + if (loop_num > 0) { + for (int i = 0; i < loop_num; i++) { + boxes_start_position = i * (input_boxes_num / loop_num); + KERNEL_CHECK((mluOpFuncKernel(k_dim, k_type, handle->queue, + boxes_data_ptr_desc->dtype, + boxes_data_ptr, + iou_threshold, + input_boxes_num, + boxes_start_position, + (uint8_t*)output_boxes_index))); + } + } else { + KERNEL_CHECK((mluOpFuncKernel(k_dim, k_type, handle->queue, + boxes_data_ptr_desc->dtype, + boxes_data_ptr, + iou_threshold, + input_boxes_num, + boxes_start_position, + (uint8_t*)output_boxes_index))); + } GEN_CASE_END(); return MLUOP_STATUS_SUCCESS; From 7d0f80470ee02cb69a0236cecc7abe3fe2658f2f Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+wenzhengyin@users.noreply.github.com> Date: Fri, 3 Feb 2023 14:05:25 +0800 Subject: [PATCH 35/36] Update ml_nms.mlu --- bangc-ops/kernels/ml_nms/ml_nms.mlu | 78 ++++++++++++++++++----------- 1 file changed, 50 insertions(+), 28 deletions(-) diff --git a/bangc-ops/kernels/ml_nms/ml_nms.mlu b/bangc-ops/kernels/ml_nms/ml_nms.mlu index 996d96347..ee0fbcc4c 100644 --- a/bangc-ops/kernels/ml_nms/ml_nms.mlu +++ b/bangc-ops/kernels/ml_nms/ml_nms.mlu @@ -26,32 +26,35 @@ #define NRAM_SIZE 2 * 1024 #define UNION_OP_KERNEL_DECLARE(Op, DType, Prefer) \ - __mlu_global__ void MLUBlockKernel##Op##DType##Prefer(\ + __mlu_global__ void MLUBlockKernel##Op##DType##Prefer( \ mluOpDataType_t data_type, void* boxes_data_ptr, \ - float nms_thres, int input_boxes_num, uint8_t* output_boxes_index);\ + float nms_thres, int input_boxes_num, int boxes_start_position, \ + uint8_t* output_boxes_index); \ #define UNION_OP_KERNEL_IMPLE(Op, DType, Prefer) \ __mlu_global__ void MLUOpKernel##Op##DType##Prefer( \ mluOpDataType_t data_type, void* boxes_data_ptr, \ - float nms_thres, int input_boxes_num, uint8_t* output_boxes_index) {\ + float nms_thres, int input_boxes_num, int boxes_start_position, \ + uint8_t* output_boxes_index) { \ int offset, seg; \ getOffsetNum##Op##Prefer(input_boxes_num, &offset); \ getSegNumMlNmsFast(input_boxes_num, &seg); \ unionImple( \ (DType*)boxes_data_ptr, (DType)nms_thres, \ - offset, seg, input_boxes_num, output_boxes_index);} + offset, seg, input_boxes_num, boxes_start_position, output_boxes_index);} template -__mlu_device__ void unionImple(T* 
boxes_data_ptr, T nms_thres, int offset, - int seg, int input_boxes_num, uint8_t* output_boxes_index) { +__mlu_device__ void unionImple(T* boxes_data_ptr, + T nms_thres, int offset, int seg, int input_boxes_num, + int boxes_start_position, uint8_t* output_boxes_index) { __nram__ char worke_space[MAX_NRAM_SIZE / 16]; __memcpy((T*)worke_space, - boxes_data_ptr + (offset * 4), - seg * 4 * sizeof(T), + boxes_data_ptr + ((boxes_start_position + offset) * 6), + seg * 6 * sizeof(T), GDRAM2NRAM); - __memcpy((T*)worke_space + (seg * 4), + __memcpy((T*)worke_space + (seg * 6), boxes_data_ptr, - 4 * sizeof(T), + 6 * sizeof(T), GDRAM2NRAM); OpFunc((T*)worke_space, nms_thres, input_boxes_num, offset, seg, output_boxes_index); @@ -106,26 +109,20 @@ __mlu_func__ void computeMlNmsFast(T* worke_space, __nram__ T* nms_thres_ptr; __nram__ T* scores_max_boxes_ptr; __nram__ T* tem; + __nram__ uint8_t* similar_index; __nram__ uint8_t* result; int compute_len; int i, j; - int data_len = seg * 4 + 4; + int data_len = seg * 6 + 6; + // ----------------------allocate memory--------------------- getComputeLen(seg, sizeof(T), &compute_len); - scores_max_boxes = worke_space + (seg * 4); + scores_max_boxes = worke_space + (seg * 6); x1 = worke_space + data_len; y1 = worke_space + (data_len + compute_len); x2 = worke_space + (data_len + (compute_len * 2)); y2 = worke_space + (data_len + (compute_len * 3)); - data_len = data_len + (compute_len * 4); - - for (i = 0, j = 0; i < seg * 4; i+=4, j++) { - x1[j] = worke_space[i]; - y1[j] = worke_space[i + 1]; - x2[j] = worke_space[i + 2]; - y2[j] = worke_space[i + 3]; - } w = worke_space + data_len; h = worke_space + (data_len + compute_len); area_ptr = worke_space + (data_len + (compute_len * 2)); @@ -134,7 +131,30 @@ __mlu_func__ void computeMlNmsFast(T* worke_space, nms_thres_ptr = worke_space + (data_len + (compute_len * 5)); scores_max_boxes_ptr = worke_space + (data_len + (compute_len * 6)); tem = worke_space + (data_len + (compute_len * 7)); - result = (uint8_t*)worke_space + (data_len + (compute_len * 8)); + if (sizeof(T) == sizeof(uint8_t)) { + similar_index = (uint8_t*)worke_space + (data_len + (compute_len * 8)); + result = (uint8_t*)worke_space + (data_len + (compute_len * 8) + seg); + } else { + similar_index = (uint8_t*)worke_space + ((data_len + (compute_len * 8)) * + (sizeof(T) / sizeof(uint8_t))); + result = (uint8_t*)worke_space + ((data_len + (compute_len * 8)) * + (sizeof(T) / sizeof(uint8_t)) + seg); + } + for (i = 0, j = 0; i < seg * 6; i+=6, j++) { + if (*(scores_max_boxes + 5) == worke_space[i + 5]) { + similar_index[j] = 1; + x1[j] = worke_space[i]; + y1[j] = worke_space[i + 1]; + x2[j] = worke_space[i + 2]; + y2[j] = worke_space[i + 3]; + } else { + similar_index[j] = 0; + x1[j] = 0.0; + y1[j] = 0.0; + x2[j] = 0.0; + y2[j] = 0.0; + } + } // -----------------iou detect-------------------- // fing all boxes area @@ -198,7 +218,7 @@ __mlu_func__ void computeMlNmsFast(T* worke_space, __bang_le(tem, inter_area_ptr, tem, compute_len); for (int i = 0; i < seg; i++) { - if (tem[i]) { + if (tem[i] && similar_index[i]) { result[i] = 1; } else { result[i] = 0; @@ -213,18 +233,20 @@ UNION_OP_KERNEL_IMPLE(MlNms, half, Fast); void MLUOP_WIN_API mluOpKernelMlNmsFloatFast( cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t data_type, void* boxes_data_ptr, float nms_thres, - int input_boxes_num, uint8_t* output_boxes_index) { + mluOpDataType_t data_type, void* boxes_data_ptr, + float nms_thres, int input_boxes_num, int 
boxes_start_position, + uint8_t* output_boxes_index) { MLUOpKernelMlNmsfloatFast<<>>( data_type, boxes_data_ptr, nms_thres, - input_boxes_num, output_boxes_index); + input_boxes_num, boxes_start_position, output_boxes_index); } void MLUOP_WIN_API mluOpKernelMlNmsHalfFast( cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t data_type, void* boxes_data_ptr, float nms_thres, - int input_boxes_num, uint8_t* output_boxes_index) { + mluOpDataType_t data_type, void* boxes_data_ptr, + float nms_thres, int input_boxes_num, int boxes_start_position, + uint8_t* output_boxes_index) { MLUOpKernelMlNmshalfFast<<>>( data_type, boxes_data_ptr, nms_thres, - input_boxes_num, output_boxes_index); + input_boxes_num, boxes_start_position, output_boxes_index); } From 512d7b034b45caddb5b9b8c22467da9795576909 Mon Sep 17 00:00:00 2001 From: wenzhengyin <71548662+wenzhengyin@users.noreply.github.com> Date: Fri, 3 Feb 2023 15:43:45 +0800 Subject: [PATCH 36/36] Update case_0.prototxt --- .../src/zoo/ml_nms/test_case/case_0.prototxt | 130 +++++++++++++++++- 1 file changed, 129 insertions(+), 1 deletion(-) diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/ml_nms/test_case/case_0.prototxt b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/ml_nms/test_case/case_0.prototxt index 616aa59dd..3f416f48b 100644 --- a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/ml_nms/test_case/case_0.prototxt +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/ml_nms/test_case/case_0.prototxt @@ -3,266 +3,394 @@ input { id: "input" shape { dims: 64 - dims: 4 + dims: 6 } layout: LAYOUT_ARRAY dtype: DTYPE_FLOAT + value_f:0 value_f: 100.00 value_f: 200.00 value_f: 200.00 value_f: 100.00 + value_f:0.70 + value_f:0 value_f: 105.00 value_f: 200.00 value_f: 205.00 value_f: 100.00 + value_f:0.90 + value_f:0 value_f: 110.00 value_f: 200.00 value_f: 210.00 value_f: 100.00 + value_f:0.60 + value_f:0 value_f: 115.00 value_f: 200.00 value_f: 215.00 value_f: 100.00 + value_f:0.90 + value_f:0 value_f: 120.00 value_f: 200.00 value_f: 220.00 value_f: 100.00 + value_f:0.90 + value_f:0 value_f: 125.00 value_f: 200.00 value_f: 225.00 value_f: 100.00 + value_f:0.80 + value_f:0 value_f: 130.00 value_f: 200.00 value_f: 230.00 value_f: 100.00 + value_f:0.70 + value_f:0 value_f: 135.00 value_f: 200.00 value_f: 235.00 value_f: 100.00 + value_f:0.90 + value_f:0 value_f: 140.00 value_f: 200.00 value_f: 240.00 value_f: 100.00 + value_f:0.90 + value_f:0 value_f: 145.00 value_f: 200.00 value_f: 245.00 value_f: 100.00 + value_f:0.90 + value_f:0 value_f: 150.00 value_f: 200.00 value_f: 250.00 value_f: 100.00 + value_f:0.90 + value_f:0 value_f: 155.00 value_f: 200.00 value_f: 255.00 value_f: 100.00 + value_f:0.90 + value_f:0 value_f: 160.00 value_f: 200.00 value_f: 260.00 value_f: 100.00 + value_f:0.90 + value_f:0 value_f: 165.00 value_f: 200.00 value_f: 265.00 value_f: 100.00 + value_f:0.90 + value_f:0 value_f: 170.00 value_f: 200.00 value_f: 270.00 value_f: 100.00 + value_f:0.90 + value_f:0 value_f: 175.00 value_f: 200.00 value_f: 275.00 value_f: 100.00 + value_f:0.90 + value_f:0 value_f: 180.00 value_f: 200.00 value_f: 280.00 value_f: 100.00 + value_f:0.90 + value_f:0 value_f: 185.00 value_f: 200.00 value_f: 285.00 value_f: 100.00 + value_f:0.90 + value_f:0 value_f: 190.00 value_f: 200.00 value_f: 290.00 value_f: 100.00 + value_f:0.90 + value_f:0 value_f: 195.00 value_f: 200.00 value_f: 295.00 value_f: 100.00 + value_f:0.90 + value_f:0 value_f: 100.00 value_f: 205.00 value_f: 200.00 value_f: 105.00 + value_f:0.90 + value_f:0 value_f: 100.00 
value_f: 210.00 value_f: 200.00 value_f: 110.00 + value_f:0.90 + value_f:0 value_f: 100.00 value_f: 215.00 value_f: 200.00 value_f: 115.00 + value_f:0.90 + value_f:0 value_f: 100.00 value_f: 220.00 value_f: 200.00 value_f: 120.00 + value_f:0.90 + value_f:0 value_f: 100.00 value_f: 225.00 value_f: 200.00 value_f: 125.00 + value_f:0.90 + value_f:0 value_f: 100.00 value_f: 230.00 value_f: 200.00 value_f: 130.00 + value_f:0.90 + value_f:0 value_f: 100.00 value_f: 235.00 value_f: 200.00 value_f: 135.00 + value_f:0.90 + value_f:0 value_f: 100.00 value_f: 240.00 value_f: 200.00 value_f: 140.00 + value_f:0.90 + value_f:0 value_f: 100.00 value_f: 245.00 value_f: 200.00 value_f: 145.00 + value_f:0.90 + value_f:0 value_f: 100.00 value_f: 250.00 value_f: 200.00 value_f: 150.00 + value_f:0.90 + value_f:0 value_f: 100.00 value_f: 255.00 value_f: 200.00 value_f: 155.00 + value_f:0.90 + value_f:0 value_f: 100.00 value_f: 260.00 value_f: 200.00 value_f: 160.00 + value_f:0.90 + value_f:0 value_f: 100.00 value_f: 265.00 value_f: 200.00 value_f: 165.00 + value_f:0.90 + value_f:0 value_f: 100.00 value_f: 270.00 value_f: 200.00 value_f: 170.00 + value_f:0.90 + value_f:0 value_f: 100.00 value_f: 275.00 value_f: 200.00 value_f: 175.00 + value_f:0.90 + value_f:0 value_f: 100.00 value_f: 280.00 value_f: 200.00 value_f: 180.00 + value_f:0.90 + value_f:0 value_f: 100.00 value_f: 285.00 value_f: 200.00 value_f: 185.00 + value_f:0.90 + value_f:0 value_f: 100.00 value_f: 290.00 value_f: 200.00 value_f: 190.00 + value_f:0.90 + value_f:0 value_f: 100.00 value_f: 295.00 value_f: 200.00 value_f: 195.00 + value_f:0.90 + value_f:0 value_f: 100.00 value_f: 300.00 value_f: 200.00 value_f: 200.00 + value_f:0.90 + value_f:0 value_f: 105.00 value_f: 195.00 value_f: 205.00 value_f: 95.00 + value_f:0.90 + value_f:0 value_f: 110.00 value_f: 190.00 value_f: 210.00 value_f: 90.00 + value_f:0.90 + value_f:0 value_f: 115.00 value_f: 185.00 value_f: 215.00 value_f: 85.00 + value_f:0.90 + value_f:0 value_f: 120.00 value_f: 180.00 value_f: 220.00 value_f: 80.00 + value_f:0.90 + value_f:0 value_f: 125.00 value_f: 175.00 value_f: 225.00 value_f: 75.00 + value_f:0.90 + value_f:0 value_f: 130.00 value_f: 170.00 value_f: 230.00 value_f: 70.00 + value_f:0.90 + value_f:0 value_f: 135.00 value_f: 165.00 value_f: 235.00 value_f: 65.00 + value_f:0.90 + value_f:0 value_f: 140.00 value_f: 160.00 value_f: 240.00 value_f: 60.00 + value_f:0.90 + value_f:0 value_f: 145.00 value_f: 155.00 value_f: 245.00 value_f: 55.00 + value_f:0.90 + value_f:0 value_f: 150.00 value_f: 150.00 value_f: 250.00 value_f: 50.00 + value_f:0.90 + value_f:0 value_f: 155.00 value_f: 145.00 value_f: 255.00 value_f: 45.00 + value_f:0.90 + value_f:0 value_f: 160.00 value_f: 140.00 value_f: 260.00 value_f: 40.00 + value_f:0.90 + value_f:0 value_f: 165.00 value_f: 135.00 value_f: 265.00 value_f: 35.00 + value_f:0.90 + value_f:0 value_f: 170.00 value_f: 130.00 value_f: 270.00 value_f: 30.00 + value_f:0.90 + value_f:0 value_f: 175.00 value_f: 125.00 value_f: 275.00 value_f: 25.00 + value_f:0.90 + value_f:0 value_f: 180.00 value_f: 120.00 value_f: 280.00 value_f: 20.00 + value_f:0.90 + value_f:0 value_f: 185.00 value_f: 115.00 value_f: 285.00 value_f: 15.00 + value_f:0.90 + value_f:0 value_f: 190.00 value_f: 110.00 value_f: 290.00 value_f: 10.00 + value_f:0.90 + value_f:0 value_f: 195.00 value_f: 105.00 value_f: 295.00 value_f: 5.00 + value_f:0.90 + value_f:0 value_f: 200.00 value_f: 100.00 value_f: 300.00 value_f: 0.00 + value_f:0.90 + value_f:0 value_f: 101.00 value_f: 200.00 value_f: 
201.00
 value_f: 100.00
+ value_f:0.90
+ value_f:0
 value_f: 108.00
 value_f: 200.00
 value_f: 208.00
 value_f: 100.00
+ value_f:0.90
+ value_f:0
 value_f: 127.00
 value_f: 200.00
 value_f: 227.00
 value_f: 100.00
+ value_f:0.90
+ value_f:0
 value_f: 137.00
 value_f: 200.00
 value_f: 237.00
 value_f: 100.00
+ value_f:0.90
 }
 output {
 id: "output"
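
As context for the host-side loop added in patch 34 above: when the per-box NRAM footprint no longer fits, mluOpMlNms falls back to loop_num kernel launches, each starting at its own boxes_start_position. The snippet below is a minimal sketch of that partitioning arithmetic only, assuming the float path and treating MAX_NRAM_SIZE as a byte budget passed in as a parameter; the helper name computeLoopNum is illustrative and not part of the library.

// Illustrative restatement of the float-path sizing in mluOpMlNms:
// every box stages 6 input floats plus roughly 14 floats of working
// buffers, so the NRAM request grows linearly with input_boxes_num.
int computeLoopNum(int input_boxes_num, int max_nram_size) {
  int apply_nram_size = (input_boxes_num * 6 * 4) + (input_boxes_num * 14 * 4);
  if (apply_nram_size <= max_nram_size) {
    return 0;  // fits in a single launch; boxes_start_position stays 0
  }
  // Round up so a partial trailing chunk still gets its own launch.
  return (apply_nram_size + max_nram_size - 1) / max_nram_size;
}

Each launch i then starts at boxes_start_position = i * (input_boxes_num / loop_num), which is the offset that unionImple scales by 6 elements per box when copying its slice into NRAM.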
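
As context for the patch 35 kernel changes: computeMlNmsFast now reduces every box to a keep/suppress flag measured against the single highest-scoring box, and a box is kept only when its sixth field matches the reference box's and their IoU does not exceed iou_threshold. The host-side C++ sketch below restates that decision rule; the 6-float layout [x1, y1, x2, y2, score, label] is an assumption made for illustration (the patches themselves do not pin down what fields 4 and 5 mean), and RefBox, iouAgainst, and referenceMlNmsMask are invented names rather than mlu-ops API.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// Box layout here is an assumption; the kernel's w = x2 - x1 and
// h = y1 - y2 convention (y1 is the upper edge) is kept.
struct RefBox {
  float x1, y1, x2, y2, score, label;
};

static float iouAgainst(const RefBox &a, const RefBox &b) {
  float area_a = (a.x2 - a.x1) * (a.y1 - a.y2);
  float area_b = (b.x2 - b.x1) * (b.y1 - b.y2);
  float iw = std::max(0.0f, std::min(a.x2, b.x2) - std::max(a.x1, b.x1));
  float ih = std::max(0.0f, std::min(a.y1, b.y1) - std::max(a.y2, b.y2));
  float inter = iw * ih;
  float uni = area_a + area_b - inter;
  return uni > 0.0f ? inter / uni : 0.0f;
}

// keep[i] == 1 when box i shares the reference box's label and its IoU
// with the reference stays within iou_threshold; 0 otherwise.
std::vector<uint8_t> referenceMlNmsMask(const std::vector<RefBox> &boxes,
                                        const RefBox &reference,
                                        float iou_threshold) {
  std::vector<uint8_t> keep(boxes.size(), 0);
  for (std::size_t i = 0; i < boxes.size(); ++i) {
    bool same_label = boxes[i].label == reference.label;
    bool low_overlap = iouAgainst(boxes[i], reference) <= iou_threshold;
    keep[i] = (same_label && low_overlap) ? 1 : 0;
  }
  return keep;
}

This mirrors the kernel's similar_index gating combined with its __bang_le overlap test; whether field 5 is really a class label or the score cannot be confirmed from these patches alone, so the struct should be read as a sketch of the control flow rather than of the exact memory layout.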