Cambricon · wenzhengyin · Aug 12, 2022 · Aug 12, 2022 · Aug 12, 2022 · Aug 12, 2022
diff --git a/bangc-ops/kernels/ml_nms/ml_nms.cpp b/bangc-ops/kernels/ml_nms/ml_nms.cpp
@@ -0,0 +1,112 @@
+/*************************************************************************
+* Copyright (C) [2022] by Cambricon, Inc.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the
+* "Software"), to deal in the Software without restriction, including
+* without limitation the rights to use, copy, modify, merge, publish,
+* distribute, sublicense, and/or sell copies of the Software, and to
+* permit persons to whom the Software is furnished to do so, subject to
+* the following conditions:
+*
+* The above copyright notice and this permission notice shall be included
+* in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+************************************************************************/
+#include <stdio.h>
+#include <string>
+#include "core/context.h"
+#include "core/gen_case.h"
+#include "core/logging.h"
+#include "core/runtime/device.h"
+#include "core/tensor.h"
+#include "core/type.h"
+#include "mlu_op_kernel.h"
+#include "mlu_op.h"
+#include "cnrt.h"
+#include "cndev.h"
+
+// Returns true when check_type equals one of the first len entries of
+// support_type, and false otherwise (including when len <= 0).
+static inline bool isSupportType(const mluOpDataType_t check_type,
+                                 const mluOpDataType_t support_type[],
+                                 const int len) {
+  int idx = 0;
+  while (idx < len && check_type != support_type[idx]) {
+    ++idx;
+  }
+  // idx stops short of len only when a matching entry was found.
+  return idx < len;
+}
+
+// Validates the arguments shared by the ML NMS entry point.
+//
+// op_name      : prefix used in the log messages.
+// handle       : checked against NULL.
+// x_desc       : checked against NULL; its dtype must appear in support_type.
+// x            : checked against NULL.
+// support_type : accepted data types.
+// len          : number of entries in support_type.
+//
+// Returns MLUOP_STATUS_BAD_PARAM when the data type is not supported and
+// MLUOP_STATUS_SUCCESS when every check passes. The NULL checks go through
+// PARAM_CHECK, which is defined outside this file.
+mluOpStatus_t MlNmsParamCheck(
+  const std::string &op_name, const mluOpHandle_t &handle,
+  const mluOpTensorDescriptor_t &x_desc, const void *x,
+  const mluOpDataType_t support_type[], const int &len) {
+  PARAM_CHECK(op_name, x_desc != NULL);
+  PARAM_CHECK(op_name, handle != NULL);
+
+  // check data type
+  if (!isSupportType(x_desc->dtype, support_type, len)) {
+    LOG(ERROR) << op_name << ":x_desc's data type is not supported.";
+    return MLUOP_STATUS_BAD_PARAM;
+  }
+  PARAM_CHECK(op_name, x != NULL);
+  return MLUOP_STATUS_SUCCESS;
+}
+
+
+// Chooses the launch configuration for the ML NMS kernel.
+//
+// The kernel is always launched as UNION1 with one x-slot per core of a
+// cluster. The y dimension is the cluster limit reported by the runtime,
+// reduced to a single cluster when the tensor has at most 2048 elements.
+static void policyFunc(const mluOpHandle_t &handle,
+                       const mluOpTensorDescriptor_t desc, cnrtDim3_t *k_dim,
+                       cnrtFunctionType_t *k_type) {
+  const size_t element_num = mluOpGetTensorElementNum(desc);
+  // Inputs up to this many elements are launched on a single cluster.
+  const size_t single_cluster_limit = 2048;
+
+  *k_type = CNRT_FUNC_TYPE_UNION1;
+  k_dim->x = handle->core_num_per_cluster;
+  k_dim->y = mluop::runtime::getClusterLimitCapability(handle);
+  k_dim->z = 1;
+
+  if (element_num <= single_cluster_limit) {
+    k_dim->y = 1;
+  }
+}
+
+// Host entry point of the ML NMS operator.
+//
+// handle              : MLU-OPS handle; supplies the queue and core layout.
+// boxes_data_ptr_desc : descriptor of the input boxes; dtype must be half or
+//                       float. The box count is total_element_num / 4.
+// boxes_data_ptr      : device pointer to the box coordinates.
+// iou_threshold       : IoU threshold forwarded to the kernel.
+// output_boxes_index  : device pointer receiving one uint8_t flag per box.
+//
+// Returns the status of the parameter check when it fails, otherwise
+// MLUOP_STATUS_SUCCESS after the kernel has been enqueued.
+mluOpStatus_t MLUOP_WIN_API mluOpMlNms(mluOpHandle_t handle,
+    const mluOpTensorDescriptor_t boxes_data_ptr_desc, void* boxes_data_ptr,
+    float iou_threshold, void* output_boxes_index) {
+  const mluOpDataType_t support_type[2] = {MLUOP_DTYPE_HALF,
+                                           MLUOP_DTYPE_FLOAT};
+  mluOpStatus_t param_check = MlNmsParamCheck(
+    "[mluOpMlNms]", handle, boxes_data_ptr_desc, boxes_data_ptr,
+    support_type, 2);
+  if (param_check != MLUOP_STATUS_SUCCESS) {
+    return param_check;
+  }
+  // The kernel stores its result through this pointer, so reject NULL here
+  // instead of faulting on the device.
+  PARAM_CHECK("[mluOpMlNms]", output_boxes_index != NULL);
+
+  cnrtDim3_t k_dim;
+  cnrtFunctionType_t k_type;
+  policyFunc(handle, boxes_data_ptr_desc, &k_dim, &k_type);
+
+  // Each box is described by four coordinates.
+  int input_boxes_num = boxes_data_ptr_desc->total_element_num / 4;
+
+  // Pick the launcher matching the element type; the parameter check above
+  // guarantees the dtype is either half or float.
+  void (*mluOpFuncKernel)(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
+    cnrtQueue_t queue, mluOpDataType_t data_type, void* boxes_data_ptr,
+    float nmsThres, int input_boxes_num, uint8_t* output_boxes_index);
+  if (boxes_data_ptr_desc->dtype == MLUOP_DTYPE_HALF) {
+    mluOpFuncKernel = mluOpKernelMlNmsHalfFast;
+  } else {
+    mluOpFuncKernel = mluOpKernelMlNmsFloatFast;
+  }
+
+  KERNEL_CHECK(
+    (mluOpFuncKernel(k_dim, k_type, handle->queue,
+       boxes_data_ptr_desc->dtype, boxes_data_ptr,
+       iou_threshold, input_boxes_num, (uint8_t*)output_boxes_index)));
+  // NOTE(review): no matching GEN_CASE_START is visible in this function.
+  GEN_CASE_END();
+
+  return MLUOP_STATUS_SUCCESS;
+}
diff --git a/bangc-ops/kernels/ml_nms/ml_nms.mlu b/bangc-ops/kernels/ml_nms/ml_nms.mlu
@@ -0,0 +1,230 @@
+/*************************************************************************
+* Copyright (C) [2022] by Cambricon, Inc.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the
+* "Software"), to deal in the Software without restriction, including
+* without limitation the rights to use, copy, modify, merge, publish,
+* distribute, sublicense, and/or sell copies of the Software, and to
+* permit persons to whom the Software is furnished to do so, subject to
+* the following conditions:
+*
+* The above copyright notice and this permission notice shall be included
+* in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+************************************************************************/
+#include "bang.h"
+#include "mlu_op_kernel.h"
+#include "kernels/kernel.h"
+
+// Parenthesised so the value survives being used inside a larger expression.
+// NOTE(review): not referenced anywhere in this file.
+#define NRAM_SIZE (2 * 1024)
+
+// Declares the device entry point that UNION_OP_KERNEL_IMPLE defines for the
+// same (Op, DType, Prefer) triple.
+#define UNION_OP_KERNEL_DECLARE(Op, DType, Prefer)                         \
+  __mlu_global__ void MLUOpKernel##Op##DType##Prefer(                      \
+    mluOpDataType_t data_type, void* boxes_data_ptr,                       \
+    float nms_thres, int input_boxes_num, uint8_t* output_boxes_index);
+
+// Defines the device entry point MLUOpKernel<Op><DType><Prefer>. It computes
+// this task's first box index and box count with getOffsetNum<Op><Prefer>
+// and getSegNum<Op><Prefer>, then runs compute<Op><Prefer> through
+// unionImple with the data pointer and threshold cast to DType.
+#define UNION_OP_KERNEL_IMPLE(Op, DType, Prefer)                           \
+  __mlu_global__ void MLUOpKernel##Op##DType##Prefer(                      \
+    mluOpDataType_t data_type, void* boxes_data_ptr,                       \
+    float nms_thres, int input_boxes_num, uint8_t* output_boxes_index) {   \
+    int offset, seg;                                                       \
+    getOffsetNum##Op##Prefer(input_boxes_num, &offset);                    \
+    getSegNum##Op##Prefer(input_boxes_num, &seg);                          \
+    unionImple<DType, compute##Op##Prefer>(                                \
+    (DType*)boxes_data_ptr, (DType)nms_thres,                              \
+    offset, seg, input_boxes_num, output_boxes_index);}
+
+// Stages one task's share of the boxes in NRAM and runs OpFunc on it.
+//
+// The NRAM buffer is filled with `seg` boxes (four T values each) starting
+// at box index `offset`, followed by the first box of the input, which
+// OpFunc receives right after the staged boxes.
+//
+// NOTE(review): the buffer is MAX_NRAM_SIZE / 16 bytes and nothing here
+// checks that `seg` boxes plus OpFunc's scratch space fit into it.
+template <typename T, void (*OpFunc)(T*, T, int, int, int, uint8_t*)>
+__mlu_device__ void unionImple(T* boxes_data_ptr, T nms_thres, int offset,
+  int seg, int input_boxes_num, uint8_t* output_boxes_index) {
+  __nram__ char work_space[MAX_NRAM_SIZE / 16];
+  // Boxes handled by this task.
+  __memcpy((T*)work_space,
+           boxes_data_ptr + (offset * 4),
+           seg * 4 * sizeof(T),
+           GDRAM2NRAM);
+  // First box of the whole input, appended after the task's boxes.
+  __memcpy((T*)work_space + (seg * 4),
+           boxes_data_ptr,
+           4 * sizeof(T),
+           GDRAM2NRAM);
+  OpFunc((T*)work_space, nms_thres, input_boxes_num, offset,
+    seg, output_boxes_index);
+}
+
+// Writes to *compute_len the element count used for vector operations over
+// `seg` elements of `elem_byte` bytes each. When __BANG_ARCH__ is below 200
+// the byte size is rounded up to a multiple of 64; between 200 and 300
+// (exclusive) it is rounded up to a multiple of 128; above 300 the count is
+// used unchanged. For a value of exactly 200 or 300 nothing is written.
+__mlu_func__ void getComputeLen(int seg, int elem_byte, int* compute_len) {
+#if (__BANG_ARCH__ < 200)
+  const int total_bytes = seg * elem_byte;
+  if (total_bytes % 64 == 0) {
+    *compute_len = seg;
+  } else {
+    *compute_len = (total_bytes / 64 + 1) * 64 / elem_byte;
+  }
+#elif (__BANG_ARCH__ > 200 && __BANG_ARCH__ < 300)
+  const int total_bytes = seg * elem_byte;
+  if (total_bytes % 128 == 0) {
+    *compute_len = seg;
+  } else {
+    *compute_len = (total_bytes / 128 + 1) * 128 / elem_byte;
+  }
+#elif (__BANG_ARCH__ > 300)
+  *compute_len = seg;
+#endif
+}
+// Writes to *offset the index of the first box handled by the current task.
+// The first (input_boxes_num % taskDim) tasks each take one box more than
+// the others, so later tasks start after those extra boxes.
+__mlu_func__ void getOffsetNumMlNmsFast(int input_boxes_num, int* offset) {
+  if (taskDim > 1) {
+    *offset = (input_boxes_num % taskDim) > taskId ?
+      (input_boxes_num / taskDim + 1) * taskId :
+      (input_boxes_num / taskDim) * taskId + (input_boxes_num % taskDim);
+  } else {
+    // A single task handles every box, so it starts at the first one.
+    // Starting at input_boxes_num would read and write past the end.
+    *offset = 0;
+  }
+}
+
+// Writes to *seg the number of boxes handled by the current task: an even
+// share of input_boxes_num, plus one for each task whose id is below the
+// remainder. A single task takes all boxes.
+__mlu_func__ void getSegNumMlNmsFast(int input_boxes_num, int* seg) {
+  if (!(taskDim > 1)) {
+    *seg = input_boxes_num;
+    return;
+  }
+  const bool takes_extra_box = (input_boxes_num % taskDim) > taskId;
+  *seg = (input_boxes_num / taskDim) + uint32_t(takes_extra_box);
+}
+
+// Compares every staged box against a reference box and writes one uint8_t
+// flag per box to output_boxes_index + offset.
+//
+// worke_space        : NRAM buffer. Elements [0, seg * 4) hold the boxes as
+//                      (x1, y1, x2, y2) quadruples, elements
+//                      [seg * 4, seg * 4 + 4) hold the reference box, and
+//                      the remainder is used as scratch space.
+// nms_thres          : IoU threshold.
+// input_boxes_num    : not used by this function.
+// offset             : index of this task's first box in the output.
+// seg                : number of boxes handled by this task.
+// output_boxes_index : GDRAM output. A flag is nonzero when
+//                      intersection <= nms_thres * union, and 0 otherwise.
+//
+// Widths are computed as x2 - x1 and heights as y1 - y2.
+// NOTE(review): the layout below needs seg * 4 + 4 + 12 * compute_len
+// elements of T; nothing here checks that against the buffer size.
+template <typename T>
+__mlu_func__ void computeMlNmsFast(T* worke_space,
+  T nms_thres, int input_boxes_num, int offset,
+  int seg, uint8_t* output_boxes_index) {
+  __nram__ T scores_max_boxes_area;
+  __nram__ T w_s, h_s;
+  __nram__ T* scores_max_boxes;
+  __nram__ T* x1;
+  __nram__ T* y1;
+  __nram__ T* x2;
+  __nram__ T* y2;
+  __nram__ T* w;
+  __nram__ T* h;
+  __nram__ T* area_ptr;
+  __nram__ T* inter_area_ptr;
+  __nram__ T* scores_max_boxes_area_ptr;
+  __nram__ T* nms_thres_ptr;
+  __nram__ T* scores_max_boxes_ptr;
+  __nram__ T* tem;
+  __nram__ uint8_t* result;
+  int compute_len;
+  int data_len = seg * 4 + 4;
+
+  getComputeLen(seg, sizeof(T), &compute_len);
+  scores_max_boxes = worke_space + (seg * 4);
+
+  // Four coordinate planes of compute_len elements each, placed right after
+  // the staged boxes and the reference box.
+  x1 = worke_space + data_len;
+  y1 = x1 + compute_len;
+  x2 = y1 + compute_len;
+  y2 = x2 + compute_len;
+
+  data_len = data_len + (compute_len * 4);
+
+  // De-interleave the (x1, y1, x2, y2) quadruples into the planes.
+  for (int i = 0, j = 0; i < seg * 4; i += 4, j++) {
+    x1[j] = worke_space[i];
+    y1[j] = worke_space[i + 1];
+    x2[j] = worke_space[i + 2];
+    y2[j] = worke_space[i + 3];
+  }
+
+  // Eight scratch planes of compute_len elements each.
+  w = worke_space + data_len;
+  h = w + compute_len;
+  area_ptr = h + compute_len;
+  inter_area_ptr = area_ptr + compute_len;
+  scores_max_boxes_area_ptr = inter_area_ptr + compute_len;
+  nms_thres_ptr = scores_max_boxes_area_ptr + compute_len;
+  scores_max_boxes_ptr = nms_thres_ptr + compute_len;
+  tem = scores_max_boxes_ptr + compute_len;
+  // NOTE(review): this offset is applied in bytes, not in elements of T, so
+  // result lands inside the planes above rather than after them. Those
+  // planes are no longer read once result is written - confirm whether
+  // (uint8_t*)(worke_space + ...) was intended.
+  result = (uint8_t*)worke_space + (data_len + (compute_len * 8));
+
+  // -----------------iou detect--------------------
+  // find the area of every box
+  __bang_sub(h, y1, y2, compute_len);
+  __bang_sub(w, x2, x1, compute_len);
+  __bang_mul(area_ptr, h, w, compute_len);
+
+  // max x1: relu(x1 - ref) + ref
+  __bang_write_value(scores_max_boxes_ptr, compute_len, scores_max_boxes[0]);
+  __bang_cycle_sub(x1, x1, scores_max_boxes_ptr, compute_len, compute_len);
+  __bang_active_relu(x1, x1, compute_len);
+  __bang_cycle_add(x1, x1, scores_max_boxes_ptr, compute_len, compute_len);
+
+  // min y1: y1 - relu(y1 - ref)
+  __bang_write_value(scores_max_boxes_ptr, compute_len, scores_max_boxes[1]);
+  __bang_sub(tem, y1, scores_max_boxes_ptr, compute_len);
+  __bang_active_relu(tem, tem, compute_len);
+  __bang_sub(y1, y1, tem, compute_len);
+
+  // min x2: x2 - relu(x2 - ref)
+  __bang_write_value(scores_max_boxes_ptr, compute_len, scores_max_boxes[2]);
+  __bang_sub(tem, x2, scores_max_boxes_ptr, compute_len);
+  __bang_active_relu(tem, tem, compute_len);
+  __bang_sub(x2, x2, tem, compute_len);
+
+  // max y2: relu(y2 - ref) + ref
+  __bang_write_value(scores_max_boxes_ptr, compute_len, scores_max_boxes[3]);
+  __bang_cycle_sub(y2, y2, scores_max_boxes_ptr, compute_len, compute_len);
+  __bang_active_relu(y2, y2, compute_len);
+  __bang_cycle_add(y2, y2, scores_max_boxes_ptr, compute_len, compute_len);
+
+  // --------- intersection -------
+  // find W, clamped at zero for boxes that do not overlap
+  __bang_sub(w, x2, x1, compute_len);
+  __bang_active_relu(w, w, compute_len);
+
+  // find H, clamped at zero for boxes that do not overlap
+  __bang_sub(h, y1, y2, compute_len);
+  __bang_active_relu(h, h, compute_len);
+
+  // find the intersection area
+  __bang_mul(inter_area_ptr, h, w, compute_len);
+
+  // find the area of the reference box
+  w_s = scores_max_boxes[2] - scores_max_boxes[0];
+  h_s = scores_max_boxes[1] - scores_max_boxes[3];
+  scores_max_boxes_area = w_s * h_s;
+
+  // tem = (area + reference area - intersection) * threshold
+  __bang_write_value(scores_max_boxes_area_ptr, compute_len,
+    scores_max_boxes_area);
+  __bang_cycle_add(tem, area_ptr, scores_max_boxes_area_ptr,
+    compute_len, compute_len);
+  __bang_sub(tem, tem, inter_area_ptr, compute_len);
+  __bang_write_value(nms_thres_ptr, compute_len, nms_thres);
+  __bang_cycle_mul(tem, tem, nms_thres_ptr, compute_len, compute_len);
+
+  // tem = (intersection <= threshold * union)
+  __bang_le(tem, inter_area_ptr, tem, compute_len);
+
+  // Narrow the comparison result to one byte per box.
+  for (int i = 0; i < seg; i++) {
+    result[i] = tem[i] ? 1 : 0;
+  }
+  __memcpy(output_boxes_index + offset, result, seg * sizeof(uint8_t),
+    NRAM2GDRAM);
+}
+
+// Define the device entry points MLUOpKernelMlNmsfloatFast and
+// MLUOpKernelMlNmshalfFast launched by the host wrappers below.
+UNION_OP_KERNEL_IMPLE(MlNms, float, Fast);
+UNION_OP_KERNEL_IMPLE(MlNms, half, Fast);
+
+// Enqueues the float ML NMS kernel on `queue` with launch dimensions k_dim
+// and function type k_type, forwarding the remaining arguments unchanged.
+void MLUOP_WIN_API mluOpKernelMlNmsFloatFast(
+  cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
+  mluOpDataType_t data_type, void* boxes_data_ptr, float nms_thres,
+  int input_boxes_num, uint8_t* output_boxes_index) {
+  MLUOpKernelMlNmsfloatFast<<<k_dim, k_type, queue>>>(
+    data_type, boxes_data_ptr, nms_thres,
+    input_boxes_num, output_boxes_index);
+}
+
+// Enqueues the half ML NMS kernel on `queue` with launch dimensions k_dim
+// and function type k_type, forwarding the remaining arguments unchanged.
+void MLUOP_WIN_API mluOpKernelMlNmsHalfFast(
+  cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
+  mluOpDataType_t data_type, void* boxes_data_ptr, float nms_thres,
+  int input_boxes_num, uint8_t* output_boxes_index) {
+  MLUOpKernelMlNmshalfFast<<<k_dim, k_type, queue>>>(
+    data_type, boxes_data_ptr, nms_thres,
+    input_boxes_num, output_boxes_index);
+}
diff --git a/bangc-ops/mlu_op.h b/bangc-ops/mlu_op.h
@@ -1464,6 +1464,30 @@ mluOpGetTensorAndDataFromTensorSet(mluOpTensorSetDescriptor_t tensorSetDesc,
                                    mluOpTensorDescriptor_t *tensorDesc,
                                    void **dataAddrInDevice);
 
+/*
+ *
+ * @param handle : Set the handle to the MLU
+ *
+ * @param boxes_data_ptr_desc : Properties of the input data; the number of
+ *        input boxes is taken from it (total element number / 4)
+ *
+ * @param boxes_data_ptr : The coordinates of the input boxes; the first box
+ *        is used as the box with the maximum score
+ *
+ * @param iou_threshold : Threshold of the intersection-over-union ratio
+ *
+ * @param output_boxes_index : Index of the output box
+ *
+ */
+
+mluOpStatus_t MLUOP_WIN_API
+mluOpMlNms(mluOpHandle_t handle,
+    const mluOpTensorDescriptor_t boxes_data_ptr_desc,
+    void* boxes_data_ptr,
+    float iou_threshold,
+    void* output_boxes_index);
 // Group:Abs
 /*!
  * @brief Computes the absolute value for every element of the input tensor \b x