Skip to content

Commit

Permalink
[Feature](mlu-ops): deprecate cnnl api and mlu-ops deprecated api
Browse files Browse the repository at this point in the history
  • Loading branch information
duzekunKTH committed Jan 12, 2024
1 parent ab9eec3 commit 0d2db79
Show file tree
Hide file tree
Showing 137 changed files with 5,578 additions and 18,758 deletions.
3 changes: 1 addition & 2 deletions core/type.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@
namespace mluop {
// This function is used to get high 32bit and low 32bit of param value.
// The hardware hasn't support 8 bytes operation, so if the sizeof(dtype) is 8
// bytes, sometimes we need to separate 8bytes to two 4bytes. Example:for
// mluOpPad, users will pass the host pointer of padding_value to mluOpPad.
// bytes, sometimes we need to separate 8bytes to two 4bytes. Example:
// uint32_t high_value = 0, low_value = 0;
// if (getSizeOfDataType(dtype) == sizeof(int64_t)) {
// getLowAndHighValueFrom64Bits(*(int64_t*)padding_value_ptr, &high_value,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -290,23 +290,34 @@ KERNEL_CHECK((KernelMaskFillCoorsForward(
k_dim, k_type, handle->queue, num_points, coors)));
```
- kernel2: mluOpUnique_v2
- kernel2: cnnlUnique_v2
该 kernel 用于完成 3.1.1节 第 2 点;
```c++
mluOpUniqueSort_t unique_mode = MLUOP_SORT_ASCEND;
mluOpUniqueDescriptor_t unique_desc;
MLUOP_CHECK(mluOpCreateUniqueDescriptor(&unique_desc));
MLUOP_CHECK(mluOpSetUniqueDescriptor(unique_desc, unique_mode, 0, true, true));
//unique op
MLUOP_CHECK((mluOpUnique_v2(handle, unique_desc, coors_desc,
coors, workspace, workspace_size,
(int *)voxel_num, voxel_coors_desc,
voxel_coors, point2voxel_map_desc,
point2voxel_map, voxel_points_count_desc,
voxel_points_count)));
MLUOP_CHECK(mluOpDestroyUniqueDescriptor(unique_desc));
cnnlUniqueSort_t unique_mode = CNNL_SORT_ASCEND;
cnnlUniqueDescriptor_t unique_desc;
CALL_CNNL(cnnlCreateUniqueDescriptor(&unique_desc));
CALL_CNNL(cnnlSetUniqueDescriptor(unique_desc, unique_mode, 0, true, true));
DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle);
DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(coors_desc, cnnl_input_desc);
DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(voxel_coors_desc, cnnl_output_desc);
DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(point2voxel_map_desc, cnnl_indices_desc);
DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(voxel_points_count_desc, cnnl_counts_desc);
CALL_CNNL(cnnlUnique_v2(cnnl_handle, unique_desc, cnnl_input_desc,
coors, workspace, workspace_size, (int *)voxel_num,
cnnl_output_desc, voxel_coors, cnnl_indices_desc,
point2voxel_map, cnnl_counts_desc, voxel_points_count));
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc);
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc);
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_indices_desc);
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_counts_desc);
DESTROY_CNNL_HANDLE(cnnl_handle);
CALL_CNNL(cnnlDestroyUniqueDescriptor(unique_desc));
int32_t num_voxels = 0;
cnrtMemcpy(&num_voxels, voxel_num, sizeof(int), CNRT_MEM_TRANS_DIR_DEV2HOST);
```
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -148,11 +148,11 @@ mluOpMaskedCol2imForward(mluOpHandle_t handle,
### 3.1 实现方案

- 该算子会将col中的channel * mask_cnt个数据赋值给im,每个channel的数据可以作为一组进行处理。为了使输入输出数据在channel维度连续,需要将输入col的shape由[co, mask_cnt]转置为[mask_cnt, co],输出im由NCHW转置为NHWC。具体步骤如下:
- step1、host端调用mluOpTranspose_v2对col进行转置
- step1、host端调用 transpose op 对col进行转置
- step2、根据channel的大小选择不同处理逻辑:
- step2-1、如果满足channel <= MAX_NRAM_SIZE / sizeof(T),此时nram能够容纳多个channel,可以一次性memcpy(GDRAM2NRAM)连续MAX_NRAM_SIZE / sizeof(T) / channel * channel数据;分别计算这些channel对应im中的位置(mask_h_idx[index%mask_cnt], mask_w_idx[index%mask_cnt]),依次memcpy(NRAM2GDRAM)至im,其中index的范围为[0, mask_cnt]
- step2-2、如果满足channel > MAX_NRAM_SIZE / sizeof(T),此时使用GDRAM2GDRAM的memcpy对单个位置的channel进行处理。
- step3、host端调用mluOpTranspose_v2对im进行转置
- step3、host端调用 transpose op 对im进行转置

### 3.2 伪代码实现(可选)

Expand Down
10 changes: 5 additions & 5 deletions docs/design_docs/masked_im2col_forward/masked_im2col_forward.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,15 +144,15 @@ mluOpMaskedIm2colForward(mluOpHandle_t handle,
### 3.1 实现方案
![im2col2](./im2col2.png)
- step1. host代码首先将feature的layout由NCHW转置为NHWC,转置后的feature保存在workspace中;
- step2. host代码中调用mluOpFill()对workspace中的data_col内存部分进行刷0;
- step2. host代码中调用cnnlFill()对workspace中的data_col内存部分进行刷0;
- step3. 判断mask覆盖的区域在feature中实际有效的部分,仅需处理实际有效部分的像素即可,使用GDRAM2GDRAM的memcpy将mask在feature中的有效覆盖区域拷贝到workspace中的data_col对应位置。
- step4. host代码将workspace中的data_col的shape由$`[mask\_cnt, kernel\_h*kernel\_w*channels]`$转回$`[kernel\_h*kernel\_w*channels, mask\_cnt]`$,转置后的数据保存在data_col中;
### 3.2 伪代码实现(可选)

```c++
//host
//mluOpTranspose_v2对feature进行转置
//mluOpFill()对workspace中的data_col内存刷成0
// host
// cnnlTranspose_v2对feature进行转置
// cnnlFill()对workspace中的data_col内存刷成0

//kernel
for (int mask_index = taskId; mask_index < mask_cnt; ++mask_index) {
Expand Down Expand Up @@ -196,7 +196,7 @@ for (int mask_index = taskId; mask_index < mask_cnt; ++mask_index) {
}

//host
//mluOpTranspose_v2对data_col进行转置
//cnnlTranspose_v2对data_col进行转置
```

### 3.3 拆分(任务拆分,多核拆分)
Expand Down
5 changes: 3 additions & 2 deletions docs/design_docs/roipoint_pool3d/roipoint_pool3d.md
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,8 @@ mluOpRoiPointPool3d(mluOpHandle_t handle,
### 3.1 实现方案

roipoint_pool3d算子实现的功能是筛选出3D boxes内的点云数据坐标和特征,mmcv内cuda实现拆分为3个kernel,mlu合并成1个kernel进行实现。
1. 首先对每个3d box遍历batch内所有点云数据,计算其是否在box边框内部。从mlu计算效率角度出发,计算需采用矢量运算(计算公式详见1.2小节),即拆成x、y、z矢量分别计算。但是points输入数据规模为[B, N, 3], x矢量、y矢量、z矢量在低纬度不连续,因此计算前要通过transpose将points规模[B, N, 3]转置为[3, B, N]。这里做统一处理,将points以其数据规模[B, N, 3]整体做转置,放到第一步来做,通过调用mluOpTranspose_v2将计算结果存储到workspace空间。
1. 首先对每个3d box遍历batch内所有点云数据,计算其是否在box边框内部。从mlu计算效率角度出发,计算需采用矢量运算(计算公式详见1.2小节),即拆成x、y、z矢量分别计算。但是points输入数据规模为[B, N, 3], x矢量、y矢量、z矢量在低纬度不连续,因此计算前要通过transpose将points规模[B, N, 3]转置为[3, B, N]。这里做统一处理,将points以其数据规模[B, N, 3]整体做转置,放到第一步来做,通过调用 transpose op 对 points 进行转置,将计算结果存储到workspace空间。
2. 多核拆分从B、M两个维度进行拆分(详见3.3小节),每个core处理拆分到的3d boxes,最小计算单元为计算3d box对应batch内的N个点云坐标是否在box边框内部。
3. 最小计算单元见如下伪代码。输入数据boxes3d表征LiDAR坐标系下3d box的参数(cx, cy, cz, dx, dy, dz, rz),其中,(cx, cy, cz)表示3d box底面中心点坐标,(dx, dy, dz)分别表示3d box的长宽高,rz表示box在俯视图下的朝向角(xy平面内),朝向角为x轴方向逆时针到box朝向的角度。(x, y, z)为点云数据在LiDAR坐标系中的坐标,旋转坐标系,计算点云数据在box坐标系(以box长宽高为x轴、y轴、z轴)中的坐标(local_x, local_y, z),并判断其是否在box边框内部。
4. 统计3d box内的点云数据数量,得到输出pooled_empty_flag。此外,不足采样点数则duplicate至采样点数,超出采样点数则做截取。
Expand Down Expand Up @@ -440,7 +441,7 @@ void check_pts_in_box3d(const T *boxes3d,

- workspace空间划分

points转置按其数据规模[3, B, N]申请workspace空间,point_features转置按其数据规模[B, C, N]申请workspace空间。输入数据points规模为[B, N, 3],point_features规模为[B, N, C]当前版本mluOpTranspose_v2调用mluOpGetTransposeWorkspaceSize根据此参数计算不需额外workspace空间
points转置按其数据规模[3, B, N]申请workspace空间,point_features转置按其数据规模[B, C, N]申请workspace空间。输入数据points规模为[B, N, 3],point_features规模为[B, N, C]。当前版本 transpose op 调用cnnlGetTransposeWorkspaceSize,根据此参数计算,不需额外workspace空间。

2、流水设计

Expand Down
8 changes: 3 additions & 5 deletions kernel_depends.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,7 @@
# ## <kernel_name> = ["op_1", "op_2"]

abs = ["unary_op"]
copy = ["tensor_stride_process"]
div = ["binary_op"]
expand = ["copy"]
fill = ["tensor_stride_process"]
log = ["unary_op"]
psroipool = ["fill"]
Expand All @@ -26,7 +24,7 @@ yolo_box = ["fill"]
deform_roi_pool = ["fill"]
moe_dispatch_backward_gate = ["fill"]
indice_convolution_backward_filter = ["fill", "transpose", "gather_nd", "matmul"]
indice_convolution_forward = ["add_n", "fill", "gather_nd", "matmul", "scatter_nd", "transpose"]
indice_convolution_forward = ["fill", "gather_nd", "matmul", "scatter_nd", "transpose"]
roipoint_pool3d = ["transpose"]
nms = ["transpose"]
carafe = ["fill", "tensor_stride_process"]
Expand Down Expand Up @@ -56,8 +54,8 @@ rotated_feature_align_forward = ["rotated_feature_align"]
voxel_pooling_forward = ["fill"]
ms_deform_attn_backward = ["fill"]
border_align_backward = ["fill"]
indice_convolution_backward_data = ["fill", "scatter_nd", "gather_nd", "transpose", "matmul", "add_n"]
indice_convolution_forward = ["add_n","fill", "gather_nd", "matmul", "scatter_nd", "transpose"]
indice_convolution_backward_data = ["fill", "scatter_nd", "gather_nd", "transpose", "matmul"]
indice_convolution_forward = ["fill", "gather_nd", "matmul", "scatter_nd", "transpose"]
sqrt_backward = ["sqrt"]
tin_shift_backward = ["tin_shift"]
tin_shift_forward = ["tin_shift"]
Expand Down
120 changes: 0 additions & 120 deletions kernels/add_n/add_n.cpp

This file was deleted.

Loading

0 comments on commit 0d2db79

Please sign in to comment.