Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature](mlu-ops): deprecate cnnl api and mlu-ops deprecated api #909

Merged
merged 1 commit into from
Jan 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions core/type.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@
namespace mluop {
// This function is used to get high 32bit and low 32bit of param value.
// The hardware hasn't support 8 bytes operation, so if the sizeof(dtype) is 8
// bytes, sometimes we need to separate 8bytes to two 4bytes. Example:for
// mluOpPad, users will pass the host pointer of padding_value to mluOpPad.
// bytes, sometimes we need to separate 8bytes to two 4bytes. Example:
// uint32_t high_value = 0, low_value = 0;
// if (getSizeOfDataType(dtype) == sizeof(int64_t)) {
// getLowAndHighValueFrom64Bits(*(int64_t*)padding_value_ptr, &high_value,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -290,23 +290,34 @@ KERNEL_CHECK((KernelMaskFillCoorsForward(
k_dim, k_type, handle->queue, num_points, coors)));
```
- kernel2: mluOpUnique_v2
- kernel2: cnnlUnique_v2
该 kernel 用于完成 3.1.1节 第 2 点;
```c++
mluOpUniqueSort_t unique_mode = MLUOP_SORT_ASCEND;
mluOpUniqueDescriptor_t unique_desc;
MLUOP_CHECK(mluOpCreateUniqueDescriptor(&unique_desc));
MLUOP_CHECK(mluOpSetUniqueDescriptor(unique_desc, unique_mode, 0, true, true));
//unique op
MLUOP_CHECK((mluOpUnique_v2(handle, unique_desc, coors_desc,
coors, workspace, workspace_size,
(int *)voxel_num, voxel_coors_desc,
voxel_coors, point2voxel_map_desc,
point2voxel_map, voxel_points_count_desc,
voxel_points_count)));
MLUOP_CHECK(mluOpDestroyUniqueDescriptor(unique_desc));
cnnlUniqueSort_t unique_mode = CNNL_SORT_ASCEND;
cnnlUniqueDescriptor_t unique_desc;
CALL_CNNL(cnnlCreateUniqueDescriptor(&unique_desc));
CALL_CNNL(cnnlSetUniqueDescriptor(unique_desc, unique_mode, 0, true, true));
DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle);
DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(coors_desc, cnnl_input_desc);
DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(voxel_coors_desc, cnnl_output_desc);
DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(point2voxel_map_desc, cnnl_indices_desc);
DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(voxel_points_count_desc, cnnl_counts_desc);
CALL_CNNL(cnnlUnique_v2(cnnl_handle, unique_desc, cnnl_input_desc,
coors, workspace, workspace_size, (int *)voxel_num,
cnnl_output_desc, voxel_coors, cnnl_indices_desc,
point2voxel_map, cnnl_counts_desc, voxel_points_count));
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc);
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc);
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_indices_desc);
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_counts_desc);
DESTROY_CNNL_HANDLE(cnnl_handle);
CALL_CNNL(cnnlDestroyUniqueDescriptor(unique_desc));
int32_t num_voxels = 0;
cnrtMemcpy(&num_voxels, voxel_num, sizeof(int), CNRT_MEM_TRANS_DIR_DEV2HOST);
```
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -148,11 +148,11 @@ mluOpMaskedCol2imForward(mluOpHandle_t handle,
### 3.1 实现方案

- 该算子会将col中的channel * mask_cnt个数据赋值给im,每个channel的数据可以作为一组进行处理。为了使输入输出数据在channel维度连续,需要将输入col的shape由[co, mask_cnt]转置为[mask_cnt, co],输出im由NCHW转置为NHWC。具体步骤如下:
- step1、host端调用mluOpTranspose_v2对col进行转置
- step1、host端调用 transpose op 对col进行转置
- step2、根据channel的大小选择不同处理逻辑:
- step2-1、如果满足channel <= MAX_NRAM_SIZE / sizeof(T),此时nram能够容纳多个channel,可以一次性memcpy(GDRAM2NRAM)连续MAX_NRAM_SIZE / sizeof(T) / channel * channel数据;分别计算这些channel对应im中的位置(mask_h_idx[index%mask_cnt], mask_w_idx[index%mask_cnt]),依次memcpy(NRAM2GDRAM)至im,其中index的范围为[0, mask_cnt]
- step2_2、如果满足channel > MAX_NRAM_SIZE / sizeof(T),此时使用GDRAM2GDRAM的memcpy对单个位置的channel进行处理。
- step3、host端调用mluOpTranspose_v2对im进行转置
- step3、host端调用 transpose op 对im进行转置

### 3.2 伪代码实现(可选)

Expand Down
10 changes: 5 additions & 5 deletions docs/design_docs/masked_im2col_forward/masked_im2col_forward.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,15 +144,15 @@ mluOpMaskedIm2colForward(mluOpHandle_t handle,
### 3.1 实现方案
![im2col2](./im2col2.png)
- step1. host代码首先将feature的layout由NCHW转置为NHWC,转置后的feature保存在workspace中;
- step2. host代码中调用mluOpFill()对workspace中的data_col内存部分进行刷0;
- step2. host代码中调用cnnlFill()对workspace中的data_col内存部分进行刷0;
- step3. 判断mask覆盖的区域在feature中实际有效的部分,仅需处理实际有效部分的像素即可,使用GDRAM2GDRAM的memcpy将mask在feature中的有效覆盖区域拷贝到workspace中的data_col对应位置。
- step4. host代码将workspace中的data_col的shape由$`[mask\_cnt, kernel\_h*kernel\_w*channels]`$转回$`[kernel\_h*kernel\_w*channels, mask\_cnt]`$,转置后的数据保存在data_col中;
### 3.2 伪代码实现(可选)

```c++
//host
//mluOpTranspose_v2对feature进行转置
//mluOpFill()对workspace中的data_col内存刷成0
// host
// cnnlTranspose_v2对feature进行转置
// cnnlFill()对workspace中的data_col内存刷成0

//kernel
for (int mask_index = taskId; mask_index < mask_cnt; ++mask_index) {
Expand Down Expand Up @@ -196,7 +196,7 @@ for (int mask_index = taskId; mask_index < mask_cnt; ++mask_index) {
}

//host
//mluOpTranspose_v2对data_col进行转置
//cnnlTranspose_v2对data_col进行转置
```

### 3.3 拆分(任务拆分,多核拆分)
Expand Down
5 changes: 3 additions & 2 deletions docs/design_docs/roipoint_pool3d/roipoint_pool3d.md
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,8 @@ mluOpRoiPointPool3d(mluOpHandle_t handle,
### 3.1 实现方案

roipoint_pool3d算子实现的功能是筛选出3D boxes内的点云数据坐标和特征,mmcv内cuda实现拆分为3个kernel,mlu合并成1个kernel进行实现。
1. 首先对每个3d box遍历batch内所有点云数据,计算其是否在box边框内部。从mlu计算效率角度出发,计算需采用矢量运算(计算公式详见1.2小节),即拆成x、y、z矢量分别计算。但是points输入数据规模为[B, N, 3], x矢量、y矢量、z矢量在低纬度不连续,因此计算前要通过transpose将points规模[B, N, 3]转置为[3, B, N]。这里做统一处理,将points以其数据规模[B, N, 3]整体做转置,放到第一步来做,通过调用mluOpTranspose_v2将计算结果存储到workspace空间。
1. 首先对每个3d box遍历batch内所有点云数据,计算其是否在box边框内部。从mlu计算效率角度出发,计算需采用矢量运算(计算公式详见1.2小节),即拆成x、y、z矢量分别计算。但是points输入数据规模为[B, N, 3], x矢量、y矢量、z矢量在低纬度不连续,因此计算前要通过transpose将points规模[B, N, 3]转置为[3, B, N]。这里做统一处理,将points以其数据规模[B, N, 3]整体做转置,放到第一步来做,通过调用 transpose op 对 points 进行转置,
将计算结果存储到workspace空间。
2. 多核拆分从B、M两个维度进行拆分(详见3.3小节),每个core处理拆分到的3d boxes,最小计算单元为计算3d box对应batch内的N个点云坐标是否在box边框内部。
3. 最小计算单元见如下伪代码。输入数据boxes3d表征LiDAR坐标系下3d box的参数(cx, cy, cz, dx, dy, dz, rz),其中,(cx, cy, cz)表示3d box底面中心点坐标,(dx, dy, dz)分别表示3d box的长宽高,rz表示box在俯视图下的朝向角(xy平面内),朝向角为x轴方向逆时针到box朝向的角度。(x, y, z)为点云数据在LiDAR坐标系中的坐标,旋转坐标系,计算点云数据在box坐标系(以box长宽高为x轴、y轴、z轴)中的坐标(local_x, local_y, z),并判断其是否在box边框内部。
4. 统计3d box内的点云数据数量,得到输出pooled_empty_flag。此外,不足采样点数则duplicate至采样点数,超出采样点数则做截取。
Expand Down Expand Up @@ -440,7 +441,7 @@ void check_pts_in_box3d(const T *boxes3d,

- workspace空间划分

points转置按其数据规模[3, B, N]申请workspace空间,point_features转置按其数据规模[B, C, N]申请workspace空间。输入数据points规模为[B, N, 3],point_features规模为[B, N, C]当前版本mluOpTranspose_v2调用mluOpGetTransposeWorkspaceSize根据此参数计算不需额外workspace空间
points转置按其数据规模[3, B, N]申请workspace空间,point_features转置按其数据规模[B, C, N]申请workspace空间。输入数据points规模为[B, N, 3],point_features规模为[B, N, C]。当前版本 transpose op 调用 cnnlGetTransposeWorkspaceSize,根据此参数计算,不需额外workspace空间。

2、流水设计

Expand Down
33 changes: 1 addition & 32 deletions kernel_depends.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,36 +7,10 @@
# ## <kernel_name> = ["op_1", "op_2"]

abs = ["unary_op"]
copy = ["tensor_stride_process"]
div = ["binary_op"]
expand = ["copy"]
fill = ["tensor_stride_process"]
log = ["unary_op"]
psroipool = ["fill"]
roi_align_rotated = ["fill"]
roi_crop = ["fill"]
rotated_feature_align = ["fill"]
sqrt = ["binary_op", "unary_op"]
tensor_stride_process = ["copy"]
moe_dispatch_backward_data = ["fill"]
roiaware_pool3d = ["fill","transpose"]
voxelization = ["fill"]
get_indice_pairs = ["fill", "scatter_nd", " gather_nd", "reduce", "unique"]
yolo_box = ["fill"]
deform_roi_pool = ["fill"]
moe_dispatch_backward_gate = ["fill"]
indice_convolution_backward_filter = ["fill", "transpose", "gather_nd", "matmul"]
indice_convolution_forward = ["add_n", "fill", "gather_nd", "matmul", "scatter_nd", "transpose"]
roipoint_pool3d = ["transpose"]
nms = ["transpose"]
carafe = ["fill", "tensor_stride_process"]
ms_deform_attn_forward = ["fill"]
dynamic_point_to_voxel_backward = ["scatter_nd", "fill"]
dynamic_point_to_voxel_forward = ["fill", "unique"]
masked_col2im_forward = ["fill", "transpose"]
masked_im2col_forward = ["fill", "transpose"]
mutual_information_backward = ["fill"]
batch_matmul_bcast = ["matmul"]
carafe = ["tensor_stride_process"]

[gtest]

Expand All @@ -53,11 +27,6 @@ roi_crop_backward = ["roi_crop"]
roi_crop_forward = ["roi_crop"]
rotated_feature_align_backward = ["rotated_feature_align"]
rotated_feature_align_forward = ["rotated_feature_align"]
voxel_pooling_forward = ["fill"]
ms_deform_attn_backward = ["fill"]
border_align_backward = ["fill"]
indice_convolution_backward_data = ["fill", "scatter_nd", "gather_nd", "transpose", "matmul", "add_n"]
indice_convolution_forward = ["add_n","fill", "gather_nd", "matmul", "scatter_nd", "transpose"]
sqrt_backward = ["sqrt"]
tin_shift_backward = ["tin_shift"]
tin_shift_forward = ["tin_shift"]
Expand Down
120 changes: 0 additions & 120 deletions kernels/add_n/add_n.cpp

This file was deleted.

Loading