Skip to content

Commit

Permalink
[Feature](mlu-ops): deprecate cnnl api and mlu-ops deprecated api
Browse files Browse the repository at this point in the history
  • Loading branch information
duzekunKTH committed Jan 12, 2024
1 parent ab9eec3 commit 0d2db79
Show file tree
Hide file tree
Showing 137 changed files with 5,578 additions and 18,758 deletions.
3 changes: 1 addition & 2 deletions core/type.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@
namespace mluop {
// This function is used to get high 32bit and low 32bit of param value.
// The hardware hasn't support 8 bytes operation, so if the sizeof(dtype) is 8
// bytes, sometimes we need to separate 8bytes to two 4bytes. Example:for
// mluOpPad, users will pass the host pointer of padding_value to mluOpPad.
// bytes, sometimes we need to separate 8bytes to two 4bytes. Example:
// uint32_t high_value = 0, low_value = 0;
// if (getSizeOfDataType(dtype) == sizeof(int64_t)) {
// getLowAndHighValueFrom64Bits(*(int64_t*)padding_value_ptr, &high_value,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -290,23 +290,34 @@ KERNEL_CHECK((KernelMaskFillCoorsForward(
k_dim, k_type, handle->queue, num_points, coors)));
```
- kernel2: mluOpUnique_v2
- kernel2: cnnlUnique_v2
该 kernel 用于完成 3.1.1节 第 2 点;
```c++
mluOpUniqueSort_t unique_mode = MLUOP_SORT_ASCEND;
mluOpUniqueDescriptor_t unique_desc;
MLUOP_CHECK(mluOpCreateUniqueDescriptor(&unique_desc));
MLUOP_CHECK(mluOpSetUniqueDescriptor(unique_desc, unique_mode, 0, true, true));
//unique op
MLUOP_CHECK((mluOpUnique_v2(handle, unique_desc, coors_desc,
coors, workspace, workspace_size,
(int *)voxel_num, voxel_coors_desc,
voxel_coors, point2voxel_map_desc,
point2voxel_map, voxel_points_count_desc,
voxel_points_count)));
MLUOP_CHECK(mluOpDestroyUniqueDescriptor(unique_desc));
cnnlUniqueSort_t unique_mode = CNNL_SORT_ASCEND;
cnnlUniqueDescriptor_t unique_desc;
CALL_CNNL(cnnlCreateUniqueDescriptor(&unique_desc));
CALL_CNNL(cnnlSetUniqueDescriptor(unique_desc, unique_mode, 0, true, true));
DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle);
DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(coors_desc, cnnl_input_desc);
DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(voxel_coors_desc, cnnl_output_desc);
DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(point2voxel_map_desc, cnnl_indices_desc);
DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(voxel_points_count_desc, cnnl_counts_desc);
CALL_CNNL(cnnlUnique_v2(cnnl_handle, unique_desc, cnnl_input_desc,
coors, workspace, workspace_size, (int *)voxel_num,
cnnl_output_desc, voxel_coors, cnnl_indices_desc,
point2voxel_map, cnnl_counts_desc, voxel_points_count));
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc);
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc);
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_indices_desc);
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_counts_desc);
DESTROY_CNNL_HANDLE(cnnl_handle);
CALL_CNNL(cnnlDestroyUniqueDescriptor(unique_desc));
int32_t num_voxels = 0;
cnrtMemcpy(&num_voxels, voxel_num, sizeof(int), CNRT_MEM_TRANS_DIR_DEV2HOST);
```
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -148,11 +148,11 @@ mluOpMaskedCol2imForward(mluOpHandle_t handle,
### 3.1 实现方案

- 该算子会将col中的channel * mask_cnt个数据赋值给im,每个channel的数据可以作为一组进行处理。为了使输入输出数据在channel维度连续,需要将输入col的shape由[co, mask_cnt]转置为[mask_cnt, co],输出im由NCHW转置为NHWC。具体步骤如下:
- step1、host端调用mluOpTranspose_v2对col进行转置
- step1、host端调用 transpose op 对col进行转置
- step2、根据channel的大小选择不同处理逻辑:
- step2-1、如果满足channel <= MAX_NRAM_SIZE / sizeof(T),此时nram能够容纳多个channel,可以一次性memcpy(GDRAM2NRAM)连续MAX_NRAM_SIZE / sizeof(T) / channel * channel数据;分别计算这些channel对应im中的位置(mask_h_idx[index%mask_cnt], mask_w_idx[index%mask_cnt]),依次memcpy(NRAM2GDRAM)至im,其中index的范围为[0, mask_cnt]
- step2-2、如果满足channel > MAX_NRAM_SIZE / sizeof(T),此时使用GDRAM2GDRAM的memcpy对单个位置的channel进行处理。
- step3、host端调用mluOpTranspose_v2对im进行转置
- step3、host端调用 transpose op 对im进行转置

### 3.2 伪代码实现(可选)

Expand Down
10 changes: 5 additions & 5 deletions docs/design_docs/masked_im2col_forward/masked_im2col_forward.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,15 +144,15 @@ mluOpMaskedIm2colForward(mluOpHandle_t handle,
### 3.1 实现方案
![im2col2](./im2col2.png)
- step1. host代码首先将feature的layout由NCHW转置为NHWC,转置后的feature保存在workspace中;
- step2. host代码中调用mluOpFill()对workspace中的data_col内存部分进行刷0;
- step2. host代码中调用cnnlFill()对workspace中的data_col内存部分进行刷0;
- step3. 判断mask覆盖的区域在feature中实际有效的部分,仅需处理实际有效部分的像素即可,使用GDRAM2GDRAM的memcpy将mask在feature中的有效覆盖区域拷贝到workspace中的data_col对应位置。
- step4. host代码将workspace中的data_col的shape由$`[mask\_cnt, kernel\_h*kernel\_w*channels]`$转回$`[kernel\_h*kernel\_w*channels, mask\_cnt]`$,转置后的数据保存在data_col中;
### 3.2 伪代码实现(可选)

```c++
//host
//mluOpTranspose_v2对feature进行转置
//mluOpFill()对workspace中的data_col内存刷成0
// host
// cnnlTranspose_v2对feature进行转置
// cnnlFill()对workspace中的data_col内存刷成0

//kernel
for (int mask_index = taskId; mask_index < mask_cnt; ++mask_index) {
Expand Down Expand Up @@ -196,7 +196,7 @@ for (int mask_index = taskId; mask_index < mask_cnt; ++mask_index) {
}

//host
//mluOpTranspose_v2对data_col进行转置
//cnnlTranspose_v2对data_col进行转置
```

### 3.3 拆分(任务拆分,多核拆分)
Expand Down
5 changes: 3 additions & 2 deletions docs/design_docs/roipoint_pool3d/roipoint_pool3d.md
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,8 @@ mluOpRoiPointPool3d(mluOpHandle_t handle,
### 3.1 实现方案

roipoint_pool3d算子实现的功能是筛选出3D boxes内的点云数据坐标和特征,mmcv内cuda实现拆分为3个kernel,mlu合并成1个kernel进行实现。
1. 首先对每个3d box遍历batch内所有点云数据,计算其是否在box边框内部。从mlu计算效率角度出发,计算需采用矢量运算(计算公式详见1.2小节),即拆成x、y、z矢量分别计算。但是points输入数据规模为[B, N, 3], x矢量、y矢量、z矢量在低纬度不连续,因此计算前要通过transpose将points规模[B, N, 3]转置为[3, B, N]。这里做统一处理,将points以其数据规模[B, N, 3]整体做转置,放到第一步来做,通过调用mluOpTranspose_v2将计算结果存储到workspace空间。
1. 首先对每个3d box遍历batch内所有点云数据,计算其是否在box边框内部。从mlu计算效率角度出发,计算需采用矢量运算(计算公式详见1.2小节),即拆成x、y、z矢量分别计算。但是points输入数据规模为[B, N, 3], x矢量、y矢量、z矢量在低纬度不连续,因此计算前要通过transpose将points规模[B, N, 3]转置为[3, B, N]。这里做统一处理,将points以其数据规模[B, N, 3]整体做转置,放到第一步来做,通过调用 transpose op 对 points 进行转置,将计算结果存储到workspace空间。
2. 多核拆分从B、M两个维度进行拆分(详见3.3小节),每个core处理拆分到的3d boxes,最小计算单元为计算3d box对应batch内的N个点云坐标是否在box边框内部。
3. 最小计算单元见如下伪代码。输入数据boxes3d表征LiDAR坐标系下3d box的参数(cx, cy, cz, dx, dy, dz, rz),其中,(cx, cy, cz)表示3d box底面中心点坐标,(dx, dy, dz)分别表示3d box的长宽高,rz表示box在俯视图下的朝向角(xy平面内),朝向角为x轴方向逆时针到box朝向的角度。(x, y, z)为点云数据在LiDAR坐标系中的坐标,旋转坐标系,计算点云数据在box坐标系(以box长宽高为x轴、y轴、z轴)中的坐标(local_x, local_y, z),并判断其是否在box边框内部。
4. 统计3d box内的点云数据数量,得到输出pooled_empty_flag。此外,不足采样点数则duplicate至采样点数,超出采样点数则做截取。
Expand Down Expand Up @@ -440,7 +441,7 @@ void check_pts_in_box3d(const T *boxes3d,

- workspace空间划分

points转置按其数据规模[3, B, N]申请workspace空间,point_features转置按其数据规模[B, C, N]申请workspace空间。输入数据points规模为[B, N, 3],point_features规模为[B, N, C]当前版本mluOpTranspose_v2调用mluOpGetTransposeWorkspaceSize根据此参数计算不需额外workspace空间
points转置按其数据规模[3, B, N]申请workspace空间,point_features转置按其数据规模[B, C, N]申请workspace空间。输入数据points规模为[B, N, 3],point_features规模为[B, N, C]。当前版本 transpose op 调用cnnlGetTransposeWorkspaceSize,根据此参数计算,不需额外workspace空间。

2、流水设计

Expand Down
8 changes: 3 additions & 5 deletions kernel_depends.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,7 @@
# ## <kernel_name> = ["op_1", "op_2"]

abs = ["unary_op"]
copy = ["tensor_stride_process"]
div = ["binary_op"]
expand = ["copy"]
fill = ["tensor_stride_process"]
log = ["unary_op"]
psroipool = ["fill"]
Expand All @@ -26,7 +24,7 @@ yolo_box = ["fill"]
deform_roi_pool = ["fill"]
moe_dispatch_backward_gate = ["fill"]
indice_convolution_backward_filter = ["fill", "transpose", "gather_nd", "matmul"]
indice_convolution_forward = ["add_n", "fill", "gather_nd", "matmul", "scatter_nd", "transpose"]
indice_convolution_forward = ["fill", "gather_nd", "matmul", "scatter_nd", "transpose"]
roipoint_pool3d = ["transpose"]
nms = ["transpose"]
carafe = ["fill", "tensor_stride_process"]
Expand Down Expand Up @@ -56,8 +54,8 @@ rotated_feature_align_forward = ["rotated_feature_align"]
voxel_pooling_forward = ["fill"]
ms_deform_attn_backward = ["fill"]
border_align_backward = ["fill"]
indice_convolution_backward_data = ["fill", "scatter_nd", "gather_nd", "transpose", "matmul", "add_n"]
indice_convolution_forward = ["add_n","fill", "gather_nd", "matmul", "scatter_nd", "transpose"]
indice_convolution_backward_data = ["fill", "scatter_nd", "gather_nd", "transpose", "matmul"]
indice_convolution_forward = ["fill", "gather_nd", "matmul", "scatter_nd", "transpose"]
sqrt_backward = ["sqrt"]
tin_shift_backward = ["tin_shift"]
tin_shift_forward = ["tin_shift"]
Expand Down
120 changes: 0 additions & 120 deletions kernels/add_n/add_n.cpp

This file was deleted.

Loading

0 comments on commit 0d2db79

Please sign in to comment.