diff --git a/docs/MLU-OPS-OpList.md b/docs/MLU-OPS-OpList.md
old mode 100644
new mode 100755
index ca0807bf3..bed2a2bf7
--- a/docs/MLU-OPS-OpList.md
+++ b/docs/MLU-OPS-OpList.md
@@ -101,4 +101,6 @@ MLU Binary Op算子结构:
 | voxel_pooling_forward | √ | |
 | voxelization | √ | |
 | yolo_box | √ | |
-| dcn_backward_data | | √ |
\ No newline at end of file
+| dcn_backward_data | | √ |
+| dcn_forward | | √ |
+| dcn_backward_weight | | √ |
\ No newline at end of file
diff --git a/docs/user_guide/9_operators/index.rst b/docs/user_guide/9_operators/index.rst
old mode 100644
new mode 100755
index 7f5540821..ec6e56dcc
--- a/docs/user_guide/9_operators/index.rst
+++ b/docs/user_guide/9_operators/index.rst
@@ -90,6 +90,24 @@ mluOpCopy
 -----------------------------
 该算子主要在语音网络中使用，对数据块进行 device 到 device 的拷贝。
 
+.. _dcn_backward_data:
+
+mluOpDCNBackwardData
+---------------------------------
+Computes the gradients of the deformable convolution operator with respect to input, offset, and mask.
+
+.. _dcn_backward_weight:
+
+mluOpDCNBackwardWeight
+-----------------------------
+Computes the gradients of the deformable convolution operator with respect to filter and bias.
+
+.. _dcn_forward:
+
+mluOpDCNForward
+-----------------------------
+Deformable convolution. It strengthens the filter's ability to model spatial geometric deformations through additional offset and mask inputs, and it can replace any regular convolution layer in an existing convolutional network.
+
 .. _deform_roi_pool_backward:
 
 mluOpDeformRoiPoolBackward
@@ -1008,9 +1026,3 @@ mluOpConcat
 - ``N`` 为每个input和output的维度数。
 - ``sum(axis_1, ..., axis_m)`` 表示对待拼接维度求和，output的拼接维度大小为所有input拼接维度的总和。
 - 除拼接维度外，其余维度的大小需要相等。
-
-.. _dcn_backward_data:
-
-mluOpDCNBackwardData
----------------------------------
-该算子用于求取可变形卷积算子关于input、offset、mask的反向梯度。
diff --git a/kernel_depends.toml b/kernel_depends.toml
old mode 100644
new mode 100755
index f6d76f6b1..7dc5a0441
--- a/kernel_depends.toml
+++ b/kernel_depends.toml
@@ -41,3 +41,5 @@ deform_roi_pool_forward = ["deform_roi_pool"]
 deform_roi_pool_backward = ["deform_roi_pool"]
 carafe_forward = ["carafe"]
 carafe_backward = ["carafe"]
+dcn_backward_weight = ["dcn_forward"]
+dcn_backward_data = ["dcn_forward"]
diff --git a/kernels/dcn_backward_data/dcn_backward_data.cpp b/kernels/dcn_backward_data/dcn_backward_data.cpp
old mode 100644
new mode 100755
index 88ea92d65..aa20bb224
--- a/kernels/dcn_backward_data/dcn_backward_data.cpp
+++ b/kernels/dcn_backward_data/dcn_backward_data.cpp
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (C) [2022] by Cambricon, Inc.
+ * Copyright (C) [2024] by Cambricon, Inc.
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the @@ -26,43 +26,7 @@ #include "kernels/utils/cnnl_helper.h" -#define DCNBPDATA_API "mluOpDcnBackwardData" - -mluOpStatus_t MLUOP_WIN_API -mluOpCreateDCNDescriptor(mluOpDCNDescriptor_t *dcn_desc) { - PARAM_CHECK(DCNBPDATA_API, dcn_desc != NULL); - CHECK_FUNC_RETURN(cnnlCreateDCNDescriptor(dcn_desc), CNNL_STATUS_SUCCESS, - "[mluOpCreateDCNDescriptor] Internal error accured in " - "cnnlCreateDCNDescriptor.", - MLUOP_STATUS_INTERNAL_ERROR); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API -mluOpDestroyDCNDescriptor(mluOpDCNDescriptor_t dcn_desc) { - PARAM_CHECK(DCNBPDATA_API, dcn_desc != NULL); - CHECK_FUNC_RETURN(cnnlDestroyDCNDescriptor(dcn_desc), CNNL_STATUS_SUCCESS, - "[mluOpDestroyDCNDescriptor] Internal error accured in " - "cnnlDestroyDCNDescriptor.", - MLUOP_STATUS_INTERNAL_ERROR); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpSetDCNDescriptor( - mluOpDCNDescriptor_t dcn_desc, int dimNb, const int pad[], - const int stride[], const int dilation[], int deformable_group, - int conv_group, int im2col_step, const mluOpDataType_t compute_type) { - PARAM_CHECK(DCNBPDATA_API, dcn_desc != NULL); - CHECK_FUNC_RETURN( - cnnlSetDCNDescriptor(dcn_desc, dimNb, pad, stride, dilation, - deformable_group, conv_group, im2col_step, - cnnlDataType_t(compute_type)), - CNNL_STATUS_SUCCESS, - "[mluOpSetDCNDescriptor] Internal error accured in " - "cnnlSetDCNDescriptor.", - MLUOP_STATUS_INTERNAL_ERROR); - return MLUOP_STATUS_SUCCESS; -} +#define DCNBPDATA_API "mluOpDCNBackwardData" mluOpStatus_t MLUOP_WIN_API mluOpGetDCNBakcwardDataWorkspaceSize( mluOpHandle_t handle, const mluOpDCNDescriptor_t dcn_desc, diff --git a/kernels/dcn_backward_weight/dcn_backward_weight.cpp b/kernels/dcn_backward_weight/dcn_backward_weight.cpp new file mode 100644 index 000000000..0f9bcb094 --- /dev/null +++ b/kernels/dcn_backward_weight/dcn_backward_weight.cpp @@ -0,0 +1,109 @@ +/************************************************************************* + * Copyright (C) [2024] by Cambricon, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/
+#include
+#include
+#include
+
+#include "kernels/utils/cnnl_helper.h"
+
+#define DCNBACKWARDWEIGHT_API "mluOpDCNBackwardWeight"
+
+mluOpStatus_t MLUOP_WIN_API mluOpGetDCNBackwardWeightWorkspaceSize(
+    mluOpHandle_t handle, const mluOpDCNDescriptor_t dcn_desc,
+    const mluOpTensorDescriptor_t input_desc,
+    const mluOpTensorDescriptor_t offset_desc,
+    const mluOpTensorDescriptor_t mask_desc,
+    const mluOpTensorDescriptor_t grad_output_desc,
+    const mluOpTensorDescriptor_t grad_filter_desc,
+    const mluOpTensorDescriptor_t grad_bias_desc, size_t *size) {
+  PARAM_CHECK("mluOpDCNBackwardWeight", handle != NULL);
+  PARAM_CHECK("mluOpDCNBackwardWeight", dcn_desc != NULL);
+  DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, _handle);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, _input_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(offset_desc, _offset_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mask_desc, _mask_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_output_desc,
+                                               _grad_output_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_filter_desc,
+                                               _grad_filter_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_bias_desc, _grad_bias_desc);
+  CHECK_FUNC_RETURN(
+      cnnlGetDCNBackwardWeightWorkspaceSize(
+          _handle, dcn_desc, _input_desc, _offset_desc, _mask_desc,
+          _grad_output_desc, _grad_filter_desc, _grad_bias_desc, size),
+      CNNL_STATUS_SUCCESS,
+      "[mluOpGetDCNBackwardWeightWorkspaceSize] Internal error occurred in "
+      "cnnlGetDCNBackwardWeightWorkspaceSize.",  // NOLINT
+      MLUOP_STATUS_INTERNAL_ERROR);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(_input_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(_offset_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(_mask_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(_grad_output_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(_grad_filter_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(_grad_bias_desc);
+  DESTROY_CNNL_HANDLE(_handle);
+  return MLUOP_STATUS_SUCCESS;
+}
+
+mluOpStatus_t MLUOP_WIN_API mluOpDCNBackwardWeight(
+    mluOpHandle_t handle, const mluOpDCNDescriptor_t dcn_desc,
+    const mluOpTensorDescriptor_t input_desc, const void *input,
+    const mluOpTensorDescriptor_t offset_desc, const void *offset,
+    const mluOpTensorDescriptor_t mask_desc, const void *mask,
+    const mluOpTensorDescriptor_t grad_output_desc, const void *grad_output,
+    void *workspace, const size_t workspace_size,
+    const mluOpTensorDescriptor_t grad_filter_desc, void *grad_filter,
+    const mluOpTensorDescriptor_t grad_bias_desc, void *grad_bias) {
+  PARAM_CHECK(DCNBACKWARDWEIGHT_API, handle != NULL);
+  if (workspace_size > 0) {
+    PARAM_CHECK(DCNBACKWARDWEIGHT_API, workspace != NULL);
+  }
+  DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_input_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(offset_desc, cnnl_offset_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mask_desc, cnnl_mask_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_output_desc,
+                                               cnnl_grad_output_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_filter_desc,
+                                               cnnl_grad_filter_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_bias_desc,
+                                               cnnl_grad_bias_desc);
+  CHECK_FUNC_RETURN(
+      cnnlDCNBackwardWeight(cnnl_handle, dcn_desc, cnnl_input_desc, input,
+                            cnnl_offset_desc, offset, cnnl_mask_desc, mask,
+                            cnnl_grad_output_desc, grad_output, workspace,
+                            workspace_size, cnnl_grad_filter_desc, grad_filter,
+                            cnnl_grad_bias_desc, grad_bias),
+      CNNL_STATUS_SUCCESS,
+      "[mluOpDCNBackwardWeight] Internal error occurred in "
+      "cnnlDCNBackwardWeight.",
+      MLUOP_STATUS_INTERNAL_ERROR);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_offset_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mask_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_output_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_filter_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_bias_desc);
+  DESTROY_CNNL_HANDLE(cnnl_handle);
+  return MLUOP_STATUS_SUCCESS;
+}
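For review convenience, a minimal sketch of the call sequence these two new entry points are meant to support, assuming a valid handle, an already-configured DCN descriptor, and already-set tensor descriptors (variable names are illustrative, cnrtMalloc from the CNRT runtime is assumed for the workspace, and status checks are omitted):

    // Query the workspace needed by the backward-weight kernel, then run it.
    size_t ws_size = 0;
    mluOpGetDCNBackwardWeightWorkspaceSize(handle, dcn_desc, input_desc,
                                           offset_desc, mask_desc,
                                           grad_output_desc, grad_filter_desc,
                                           grad_bias_desc, &ws_size);
    void *workspace = nullptr;
    if (ws_size > 0) cnrtMalloc(&workspace, ws_size);  // assumed CNRT helper
    mluOpDCNBackwardWeight(handle, dcn_desc, input_desc, input, offset_desc,
                           offset, mask_desc, mask, grad_output_desc,
                           grad_output, workspace, ws_size, grad_filter_desc,
                           grad_filter, grad_bias_desc, grad_bias);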
diff --git a/kernels/dcn_forward/dcn_common.h b/kernels/dcn_forward/dcn_common.h
new file mode 100644
index 000000000..59acab57a
--- /dev/null
+++ b/kernels/dcn_forward/dcn_common.h
@@ -0,0 +1,69 @@
+/*************************************************************************
+ * Copyright (C) [2022] by Cambricon, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/
+#ifndef KERNELS_DCN_COMMON_DCN_COMMON_H
+#define KERNELS_DCN_COMMON_DCN_COMMON_H
+#include
+#include
+#include
+
+#include "kernels/utils/cnnl_helper.h"
+
+#define DCN_API "mluOpDCN"
+
+mluOpStatus_t MLUOP_WIN_API
+mluOpCreateDCNDescriptor(mluOpDCNDescriptor_t *dcn_desc) {
+  PARAM_CHECK(DCN_API, dcn_desc != NULL);
+  CHECK_FUNC_RETURN(cnnlCreateDCNDescriptor(dcn_desc), CNNL_STATUS_SUCCESS,
+                    "[mluOpCreateDCNDescriptor] Internal error occurred in "
+                    "cnnlCreateDCNDescriptor.",
+                    MLUOP_STATUS_INTERNAL_ERROR);
+  return MLUOP_STATUS_SUCCESS;
+}
+
+mluOpStatus_t MLUOP_WIN_API
+mluOpDestroyDCNDescriptor(mluOpDCNDescriptor_t dcn_desc) {
+  PARAM_CHECK(DCN_API, dcn_desc != NULL);
+  CHECK_FUNC_RETURN(cnnlDestroyDCNDescriptor(dcn_desc), CNNL_STATUS_SUCCESS,
+                    "[mluOpDestroyDCNDescriptor] Internal error occurred in "
+                    "cnnlDestroyDCNDescriptor.",
+                    MLUOP_STATUS_INTERNAL_ERROR);
+  return MLUOP_STATUS_SUCCESS;
+}
+
+mluOpStatus_t MLUOP_WIN_API mluOpSetDCNDescriptor(
+    mluOpDCNDescriptor_t dcn_desc, int dimNb, const int pad[],
+    const int stride[], const int dilation[], int deformable_group,
+    int conv_group, int im2col_step, const mluOpDataType_t compute_type) {
+  PARAM_CHECK(DCN_API, dcn_desc != NULL);
+  CHECK_FUNC_RETURN(
+      cnnlSetDCNDescriptor(dcn_desc, dimNb, pad, stride, dilation,
+                           deformable_group, conv_group, im2col_step,
+                           cnnlDataType_t(compute_type)),
+      CNNL_STATUS_SUCCESS,
+      "[mluOpSetDCNDescriptor] Internal error occurred in "
+      "cnnlSetDCNDescriptor.",
+      MLUOP_STATUS_INTERNAL_ERROR);
+  return MLUOP_STATUS_SUCCESS;
+}
+
+#endif  // KERNELS_DCN_COMMON_DCN_COMMON_H
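The helpers above are thin wrappers over the cnnl descriptor API; a minimal lifecycle sketch for a 4-D NHWC case (all parameter values below are illustrative, not mandated by this patch):

    mluOpDCNDescriptor_t dcn_desc;
    mluOpCreateDCNDescriptor(&dcn_desc);
    int pad[4] = {0, 0, 0, 0};   // top, bottom, left, right
    int stride[2] = {1, 1};      // stride_h, stride_w
    int dilation[2] = {1, 1};    // dilation_h, dilation_w
    // dimNb = 4, deformable_group = 1, conv_group = 1, im2col_step = 1
    mluOpSetDCNDescriptor(dcn_desc, 4, pad, stride, dilation, 1, 1, 1,
                          MLUOP_DTYPE_FLOAT);
    // ... run mluOpDCNForward / mluOpDCNBackwardData / mluOpDCNBackwardWeight ...
    mluOpDestroyDCNDescriptor(dcn_desc);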
diff --git a/kernels/dcn_forward/dcn_forward.cpp b/kernels/dcn_forward/dcn_forward.cpp
new file mode 100644
index 000000000..c746f8971
--- /dev/null
+++ b/kernels/dcn_forward/dcn_forward.cpp
@@ -0,0 +1,103 @@
+/*************************************************************************
+ * Copyright (C) [2024] by Cambricon, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/
+#include "kernels/dcn_forward/dcn_common.h"
+
+#define DCNFORWARD_API "mluOpDCNForward"
+
+mluOpStatus_t MLUOP_WIN_API mluOpGetDCNForwardWorkspaceSize(
+    mluOpHandle_t handle, const mluOpDCNDescriptor_t dcn_desc,
+    const mluOpTensorDescriptor_t input_desc,
+    const mluOpTensorDescriptor_t offset_desc,
+    const mluOpTensorDescriptor_t mask_desc,
+    const mluOpTensorDescriptor_t filter_desc,
+    const mluOpTensorDescriptor_t bias_desc,
+    const mluOpTensorDescriptor_t output_desc, size_t *size) {
+  PARAM_CHECK("mluOpDCNForward", handle != NULL);
+  PARAM_CHECK("mluOpDCNForward", dcn_desc != NULL);
+  PARAM_CHECK("mluOpDCNForward", input_desc != NULL);
+  PARAM_CHECK("mluOpDCNForward", offset_desc != NULL);
+  PARAM_CHECK("mluOpDCNForward", filter_desc != NULL);
+  PARAM_CHECK("mluOpDCNForward", output_desc != NULL);
+  PARAM_CHECK("mluOpDCNForward", size != NULL);
+  DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_input_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(offset_desc, cnnl_offset_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mask_desc, cnnl_mask_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filter_desc, cnnl_filter_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(bias_desc, cnnl_bias_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(output_desc, cnnl_output_desc);
+  CHECK_FUNC_RETURN(cnnlGetDCNForwardWorkspaceSize(
+                        cnnl_handle, dcn_desc, cnnl_input_desc,
+                        cnnl_offset_desc, cnnl_mask_desc, cnnl_filter_desc,
+                        cnnl_bias_desc, cnnl_output_desc, size),
+                    CNNL_STATUS_SUCCESS,
+                    "[mluOpGetDCNForwardWorkspaceSize] Internal error "
+                    "occurred in cnnlGetDCNForwardWorkspaceSize.",  // NOLINT
+                    MLUOP_STATUS_INTERNAL_ERROR);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_offset_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mask_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_filter_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_bias_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc);
+  DESTROY_CNNL_HANDLE(cnnl_handle);
+  return MLUOP_STATUS_SUCCESS;
+}
+
+mluOpStatus_t MLUOP_WIN_API
+mluOpDCNForward(mluOpHandle_t handle, const mluOpDCNDescriptor_t dcn_desc,
+                const mluOpTensorDescriptor_t input_desc, const void *input,
+                const mluOpTensorDescriptor_t offset_desc, const void *offset,
+                const mluOpTensorDescriptor_t mask_desc, const void *mask,
+                const mluOpTensorDescriptor_t filter_desc, const void *filter,
+                const mluOpTensorDescriptor_t bias_desc, const void *bias,
+                void *workspace, size_t workspace_size,
+                const mluOpTensorDescriptor_t output_desc, void *output) {
+  PARAM_CHECK(DCNFORWARD_API, handle != NULL);
+  if (workspace_size > 0) {
+    PARAM_CHECK(DCNFORWARD_API, workspace != NULL);
+  }
+  DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_input_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(offset_desc, cnnl_offset_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mask_desc, cnnl_mask_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filter_desc, cnnl_filter_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(bias_desc, cnnl_bias_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(output_desc, cnnl_output_desc);
+  CHECK_FUNC_RETURN(
+      cnnlDCNForward(cnnl_handle, dcn_desc, cnnl_input_desc, input,
+                     cnnl_offset_desc, offset, cnnl_mask_desc, mask,
+                     cnnl_filter_desc, filter, cnnl_bias_desc, bias, workspace,
+                     workspace_size, cnnl_output_desc, output),
+      CNNL_STATUS_SUCCESS,
+      "[mluOpDCNForward] Internal error occurred in cnnlDCNForward.",
+      MLUOP_STATUS_INTERNAL_ERROR);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_offset_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mask_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_filter_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_bias_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc);
+  DESTROY_CNNL_HANDLE(cnnl_handle);
+  return MLUOP_STATUS_SUCCESS;
+}
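A matching sketch for the forward path, mirroring the backward-weight example earlier (same assumptions: illustrative names, pre-set descriptors, cnrtMalloc assumed, status checks omitted):

    size_t fwd_ws_size = 0;
    mluOpGetDCNForwardWorkspaceSize(handle, dcn_desc, input_desc, offset_desc,
                                    mask_desc, filter_desc, bias_desc,
                                    output_desc, &fwd_ws_size);
    void *fwd_ws = nullptr;
    if (fwd_ws_size > 0) cnrtMalloc(&fwd_ws, fwd_ws_size);
    mluOpDCNForward(handle, dcn_desc, input_desc, input, offset_desc, offset,
                    mask_desc, mask, filter_desc, filter, bias_desc, bias,
                    fwd_ws, fwd_ws_size, output_desc, output);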
diff --git a/mlu_op.h b/mlu_op.h
old mode 100644
new mode 100755
index 0a1795162..f530af5e3
--- a/mlu_op.h
+++ b/mlu_op.h
@@ -12070,7 +12070,7 @@ mluOpSetDCNDescriptor(mluOpDCNDescriptor_t dcn_desc,
  * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_EXECUTION_FAILED
  *
  * @par Note
- * - Call this function after calling the ::mluOpDCNBackwardData,
+ * - Call this function after calling the ::mluOpDCNBackwardData, ::mluOpDCNForward,
  * or ::mluOpDCNBackwardWeight. Otherwise, \p MLUOP_STATUS_BAD_PARAM is returned.
  * - It is necessary to call this function destroy the deformable convolution descriptor.
  * to avoid the memory leaks.
@@ -12087,6 +12087,470 @@ mluOpDestroyDCNDescriptor(mluOpDCNDescriptor_t dcn_desc);
 mluOpStatus_t MLUOP_WIN_API
 mluOpDestroyDCNDescriptor(mluOpDCNDescriptor_t dcn_desc);
 
+// Group:DCN
+/*!
+ * @brief Returns in \p workspace_size the size of the MLU memory that is used as an extra
+ * workspace to optimize the deformable convolution forward operation.
+ *
+ * The size of the extra workspace is determined by the deformable convolution
+ * forward operation, including the deformable convolution descriptor \p dcn_desc,
+ * input tensor descriptor \p input_desc, offset tensor
+ * descriptor \p offset_desc, mask tensor descriptor \p mask_desc, filter tensor descriptor
+ * \p filter_desc, bias tensor descriptor \p bias_desc, and output tensor descriptor \p output_desc.
+ * For more information about the workspace, see "Cambricon MLUOP User Guide."
+ *
+ * @param[in] handle
+ * Input. Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in the
+ * deformable convolution operation. For detailed information, see ::mluOpHandle_t.
+ * @param[in] dcn_desc
+ * Input. The descriptor of the deformable convolution operation. For detailed information, see
+ * ::mluOpDCNDescriptor_t.
+ * @param[in] input_desc
+ * Input. The descriptor of the input tensor. For detailed information, see
+ * ::mluOpTensorDescriptor_t.
+ * @param[in] offset_desc
+ * Input. The descriptor of the offset tensor. For detailed information, see
+ * ::mluOpTensorDescriptor_t.
+ * @param[in] mask_desc
+ * Input. The descriptor of the mask tensor. Set this parameter to NULL if mask is not needed. For detailed
+ * information, see ::mluOpTensorDescriptor_t.
+ * @param[in] filter_desc
+ * Input. The descriptor of the filter tensor used as a filter in the deformable convolution
+ * operation. For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[in] bias_desc
+ * Input. The descriptor of the bias tensor. Set this parameter to NULL if bias is not needed.
+ * For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[in] output_desc
+ * Input. The descriptor of the output tensor.
+ * For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[out] workspace_size
+ * Output. Pointer to the returned size of the extra workspace in bytes that is used in the
+ * deformable convolution forward operation.
+ *
+ * @par Return
+ * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM
+ *
+ * @par API Dependency
+ * - You must call the ::mluOpCreateTensorDescriptor and ::mluOpSetTensorDescriptor functions
+ * to create and set the tensor descriptors \p input_desc, \p offset_desc, \p mask_desc (optional),
+ * \p filter_desc, and \p bias_desc (optional) before calling this function.
+ * - The allocated extra workspace must be passed to the ::mluOpDCNForward function to perform
+ * the deformable convolution forward operation.
+ *
+ * @par Note
+ * - None.
+ *
+ * @par Requirements
+ * - None.
+ *
+ * @par Example
+ * - None.
+ *
+ * @par Reference
+ * - None.
+ */
+mluOpStatus_t MLUOP_WIN_API
+mluOpGetDCNForwardWorkspaceSize(mluOpHandle_t handle,
+                                const mluOpDCNDescriptor_t dcn_desc,
+                                const mluOpTensorDescriptor_t input_desc,
+                                const mluOpTensorDescriptor_t offset_desc,
+                                const mluOpTensorDescriptor_t mask_desc,
+                                const mluOpTensorDescriptor_t filter_desc,
+                                const mluOpTensorDescriptor_t bias_desc,
+                                const mluOpTensorDescriptor_t output_desc,
+                                size_t *workspace_size);
+
+// Group:DCN
+/*!
+ * @brief Performs a 2D deformable convolution forward operation. Compared with the standard
+ * convolution, the deformable convolution introduces 2D offsets and masks to make
+ * the convolution adapt to the geometric variation of objects.
+ * Offsets act on the regular grid sampling locations, which enables a free form
+ * deformation of the sampling grid. The mask is a modulation mechanism that improves the ability
+ * to focus on pertinent image regions. Both offsets and masks are
+ * learnable parameters obtained from additional convolutional layers.
+ *
+ * @param[in] handle
+ * Input. Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues. For
+ * detailed information, see ::mluOpHandle_t.
+ * @param[in] dcn_desc
+ * Input. The descriptor of the deformable convolution. For detailed information, see
+ * ::mluOpDCNDescriptor_t.
+ * @param[in] input_desc
+ * Input. The descriptor of the input tensor. For detailed information,
+ * see ::mluOpTensorDescriptor_t.
+ * @param[in] input
+ * Input. Pointer to the MLU memory that stores the input tensor.
+ * @param[in] offset_desc
+ * Input. The descriptor of the offset tensor to be applied to each position in the convolution kernel.
+ * The shape of the offset should be (batch, out_height, out_width,
+ * 2 * deformable_group * filter_height * filter_width). For detailed information,
+ * see ::mluOpTensorDescriptor_t.
+ * @param[in] offset
+ * Input. Pointer to the MLU memory that stores the offset tensor.
+ * @param[in] mask_desc
+ * Input. The descriptor of the scaling factor to be applied to each position in the convolution
+ * kernel. The shape of the mask must be (batch, out_height, out_width,
+ * deformable_group * filter_height * filter_width). Set this parameter to NULL when
+ * the mask is not requested. For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[in] mask
+ * Input. Pointer to the MLU memory that stores the mask tensor. Set this parameter to NULL
+ * when mask is not requested.
+ * @param[in] filter_desc
+ * Input. The descriptor of the filter tensor used as a filter in the deformable convolution
+ * operation. For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[in] filter
+ * Input. Pointer to the MLU memory that stores the filter tensor.
+ * @param[in] bias_desc
+ * Input. The descriptor of the bias tensor. Set this parameter to NULL when bias is not
+ * requested.
+ * For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[in] bias
+ * Input. Pointer to the MLU memory that stores the bias tensor. Set this parameter to NULL when bias is not
+ * requested.
+ * @param[in] workspace
+ * Input. Pointer to the MLU memory that is used as an extra workspace for the
+ * deformable convolution operation. For more information about workspace, see
+ * "Cambricon MLUOP User Guide".
+ * @param[in] workspace_size
+ * Input. The size of the extra workspace in bytes needed for the deformable
+ * convolution operation. You can get the size of the workspace with the
+ * ::mluOpGetDCNForwardWorkspaceSize function.
+ * @param[in] output_desc
+ * Input. The descriptor of the output tensor. The shape of the output is the same as
+ * that of a standard convolution output.
+ * @param[out] output
+ * Output. Pointer to the MLU memory that stores the output tensor.
+ * @par Return
+ * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM,
+ * ::MLUOP_STATUS_NOT_SUPPORTED, ::MLUOP_STATUS_NUMERICAL_OVERFLOW
+ * @par Formula
+ * - See "Deformable Convolution Operator" section in "Cambricon MLUOP User Guide" for details.
+ *
+ * @par Data Type
+ * - The off-chip data type of \p input, \p offset, \p mask, \p filter, \p bias, and \p output must be the same.
+ * - The supported off-chip data types of the input tensor and output tensor are as follows:
+ * - input, offset, mask, filter, bias, output: half, float.
+ * - This function supports any combinations of the following on-chip data types for input tensor
+ * \p input and \p filter on MLU200 series and CE3226.
+ * - \p input onchip data type: int16, int31.
+ * - \p filter onchip data type: int16, int31.
+ * - \p input offchip data type can be combined with any supported onchip data types.
+ * - \p filter offchip data type can be combined with any supported onchip data types.
+ * - This function also supports floating-point computation on MLU300 series or above.
+ * To perform floating-point computation, the onchip data type of \p input and \p filter
+ * should be \p MLUOP_DTYPE_INVALID or the same as the corresponding offchip data type.
+ *
+ * @par Data Layout
+ * - The supported data layouts of the input tensor, filter, bias tensor, and output tensor are
+ * as follows:
+ * - input, offset, mask, filter, output: \p MLUOP_LAYOUT_NHWC.
+ * - bias: \p MLUOP_LAYOUT_ARRAY.
+ *
+ * @par Scale Limitation
+ * - The input, offset, mask, filter, bias, output and the deformable convolution descriptor
+ * (including pad, stride, dilation, deformable_group, conv_group, im2col_step) must meet the
+ * following requirements:
+ * - input tensor: \p batch > 0, \p height > 0, \p width > 0, \p channel > 0.
+ * - offset tensor: \p batch should be equal to the batch size of input tensor, \p height and \p width
+ * should be equal to the height and width of output tensor accordingly. \p channel should be equal to
+ * deformable_group * filter_height * filter_width * 2.
+ * - mask tensor: When mask is needed, \p batch should be equal to the batch size of input tensor,
+ * \p height and \p width should be equal to the height and width of output tensor accordingly.
+ * \p channel should be equal to deformable_group * filter_height * filter_width.
+ * - The value of (im2col_step * out_height * out_width * filter_h * filter_w * input_channel)
+ * should be less than or equal to the INT_MAX defined in limits.h.
+ * @par API Dependency
+ * - Before calling this function to implement deformable convolution, you need to prepare
+ * all the parameters passed to this function.
+ * See each parameter description for details.
+ *
+ * @par Performance Optimization
+ * - To achieve better performance, set the im2col_step equal to the batch
+ * size of the input tensor.
+ *
+ * @par Note
+ * - \p input, \p offset, \p mask, \p filter, \p bias, and \p output
+ * must be contiguous in the MLU memory.
+ *
+ * @par Requirements
+ * - None.
+ *
+ * @par Example
+ * - The example of the deformable convolution forward operation is as follows:
+ @verbatim
+
+ input tensor by 1 * 3 * 3 * 2 --> input:
+ [[[[0.7944, 0.4922], [0.2008, 0.2081], [0.9998, 0.3053]],
+   [[0.1815, 0.9210], [0.8463, 0.1819], [0.9159, 0.4917]],
+   [[0.6668, 0.2843], [0.8364, 0.2765], [0.7150, 0.6780]]]]
+ offset tensor by 1 * 3 * 3 * 2 --> offset:
+ [[[[-0.6317, -1.4928], [-0.0696, 1.1910], [ 0.8778, 0.5145]],
+   [[-0.9248, -0.9889], [ 0.6157, 0.2157], [-1.1540, -0.1283]],
+   [[-0.5704, 1.0237], [ 0.7956, 1.1203], [-0.0129, -0.2686]]]]
+ mask tensor by 1 * 3 * 3 * 1 --> mask:
+ [[[[ 0.4581], [-1.1605], [ 0.5951]],
+   [[ 0.4313], [ 0.1070], [ 0.0225]],
+   [[ 0.7484], [ 0.6262], [ 1.1908]]]]
+ filter tensor by 2 * 1 * 1 * 2 --> filter:
+ [[[[0.8928, 0.9682]]], [[[0.9301, 0.6817]]]]
+ bias tensor by 2 --> bias:
+ [0.4356, 0.0840]
+
+ param:
+ pad: (0, 0, 0, 0), stride: (1, 1), dilation: (1, 1)
+
+ output tensor by 1 * 3 * 3 * 2 --> output:
+ [[[[ 0.4356, 0.0840], [-0.6024, -0.9101], [ 0.8056, 0.4252]],
+   [[ 0.4412, 0.0890], [ 0.5478, 0.1898], [ 0.4562, 0.1037]],
+   [[ 1.1652, 0.7876], [ 0.5814, 0.2109], [ 1.8874, 1.3752]]]]
+ @endverbatim
+ *
+ * @par Reference
+ * - https://github.com/msracver/Deformable-ConvNets
+ * - Deformable Convolutional Networks, Jifeng Dai, et al., 2017.
+ * - Deformable ConvNets v2: More Deformable, Better Results, Xizhou Zhu, et al., 2018.
+ */
+mluOpStatus_t MLUOP_WIN_API
+mluOpDCNForward(mluOpHandle_t handle,
+                const mluOpDCNDescriptor_t dcn_desc,
+                const mluOpTensorDescriptor_t input_desc,
+                const void *input,
+                const mluOpTensorDescriptor_t offset_desc,
+                const void *offset,
+                const mluOpTensorDescriptor_t mask_desc,
+                const void *mask,
+                const mluOpTensorDescriptor_t filter_desc,
+                const void *filter,
+                const mluOpTensorDescriptor_t bias_desc,
+                const void *bias,
+                void *workspace,
+                size_t workspace_size,
+                const mluOpTensorDescriptor_t output_desc,
+                void *output);
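Since the offset and mask shape rules above are easy to get wrong, here is a small sketch of the descriptor extents they imply, using the real ::mluOpSetTensorDescriptor API but illustrative variable names (dg, kh, and kw stand for deformable_group and the filter height/width; none of these names are part of the patch):

    int offset_dims[4] = {batch, out_height, out_width, 2 * dg * kh * kw};
    int mask_dims[4] = {batch, out_height, out_width, dg * kh * kw};
    mluOpSetTensorDescriptor(offset_desc, MLUOP_LAYOUT_NHWC, MLUOP_DTYPE_FLOAT,
                             4, offset_dims);
    mluOpSetTensorDescriptor(mask_desc, MLUOP_LAYOUT_NHWC, MLUOP_DTYPE_FLOAT,
                             4, mask_dims);

In the @verbatim example above, dg = kh = kw = 1, so the offset has 2 channels and the mask has 1, matching the listed tensors.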
+
+// Group:DCN
+/*!
+ * @brief Returns in \p workspace_size the size of the MLU memory that is used as an extra
+ * workspace to optimize the deformable convolution backward filter operation.
+ *
+ * The size of the extra workspace is determined by the deformable convolution
+ * backward filter operation, including the deformable convolution descriptor \p dcn_desc,
+ * input tensor descriptor \p input_desc, offset tensor
+ * descriptor \p offset_desc, mask tensor descriptor \p mask_desc, gradient with respect to
+ * the output tensor \p grad_output_desc, the gradient with respect to the filter tensor
+ * \p grad_filter_desc, and the gradient with respect to the bias tensor \p grad_bias_desc.
+ * For more information about the workspace, see "Cambricon MLUOP User Guide."
+ *
+ * @param[in] handle
+ * Input. Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in the
+ * deformable convolution operation. For detailed information, see ::mluOpHandle_t.
+ * @param[in] dcn_desc
+ * Input. The descriptor of the deformable convolution operation. For detailed information, see
+ * ::mluOpDCNDescriptor_t.
+ * @param[in] input_desc
+ * Input. The descriptor of the input tensor. For detailed information, see
+ * ::mluOpTensorDescriptor_t.
+ * @param[in] offset_desc
+ * Input. The descriptor of the offset tensor. For detailed information, see
+ * ::mluOpTensorDescriptor_t.
+ * @param[in] mask_desc
+ * Input. The descriptor of the mask tensor. Set this parameter to NULL if mask is not needed. For detailed
+ * information, see ::mluOpTensorDescriptor_t.
+ * @param[in] grad_output_desc
+ * Input. The descriptor of the gradient with respect to the output tensor.
+ * For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[in] grad_filter_desc
+ * Input. The descriptor of the gradient with respect to the filter tensor.
+ * For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[in] grad_bias_desc
+ * Input. The descriptor of the gradient with respect to the bias tensor.
+ * Set this parameter to NULL if the gradient with respect to bias is not needed.
+ * For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[out] workspace_size
+ * Output. Pointer to the returned size of the extra workspace in bytes that is used in the
+ * deformable convolution backward filter operation.
+ *
+ * @par Return
+ * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM
+ *
+ * @par API Dependency
+ * - You must call the ::mluOpCreateTensorDescriptor and ::mluOpSetTensorDescriptor functions
+ * to create and set the tensor descriptors \p input_desc, \p offset_desc, \p mask_desc (optional),
+ * \p grad_output_desc, \p grad_filter_desc, and \p grad_bias_desc (optional) before calling this
+ * function.
+ * - The allocated extra workspace must be passed to the ::mluOpDCNBackwardWeight function to
+ * perform the deformable convolution backward filter operation.
+ *
+ * @par Note
+ * - None.
+ *
+ * @par Requirements
+ * - None.
+ *
+ * @par Example
+ * - None.
+ *
+ * @par Reference
+ * - None.
+ */
+
+mluOpStatus_t MLUOP_WIN_API
+mluOpGetDCNBackwardWeightWorkspaceSize(mluOpHandle_t handle,
+                                       const mluOpDCNDescriptor_t dcn_desc,
+                                       const mluOpTensorDescriptor_t input_desc,
+                                       const mluOpTensorDescriptor_t offset_desc,
+                                       const mluOpTensorDescriptor_t mask_desc,
+                                       const mluOpTensorDescriptor_t grad_output_desc,
+                                       const mluOpTensorDescriptor_t grad_filter_desc,
+                                       const mluOpTensorDescriptor_t grad_bias_desc,
+                                       size_t *workspace_size);
+
+// Group:DCN
+/*!
+ * @brief Performs the back-propagation of a deformable convolution operation to compute
+ * the gradient with respect to filter \p grad_filter and bias \p grad_bias
+ * based on the gradient of response \p grad_output.
+ *
+ * This function needs extra MLU memory as the workspace to improve the performance.
+ * You can get the size of the workspace \p workspace_size with the
+ * ::mluOpGetDCNBackwardWeightWorkspaceSize function.
+ *
+ * @param[in] handle
+ * Input. Handle to a Cambricon MLUOP context that is used to manage MLU devices and
+ * queues in the deformable convolution backward filter operation. For detailed information,
+ * see ::mluOpHandle_t.
+ * @param[in] dcn_desc
+ * Input. The descriptor of the deformable convolution operation. For detailed information,
+ * see ::mluOpDCNDescriptor_t.
+ * @param[in] input_desc
+ * Input. The descriptor of the input tensor. For detailed information,
+ * see ::mluOpTensorDescriptor_t.
+ * @param[in] input
+ * Input. Pointer to the MLU memory that stores the input tensor.
+ * @param[in] offset_desc
+ * Input. The descriptor of the offset to be applied to each position in the convolution kernel.
+ * The shape of offset should be (batch, out_height, out_width,
+ * 2 * deformable_group * filter_height * filter_width). For detailed information,
+ * see ::mluOpTensorDescriptor_t.
+ * @param[in] offset
+ * Input. Pointer to the MLU memory that stores the offset tensor.
+ * @param[in] mask_desc
+ * Input. The descriptor of the scaling factor to be applied to each position in the convolution
+ * kernel. The shape of the mask must be (batch, out_height, out_width,
+ * deformable_group * filter_height * filter_width). Set this parameter to NULL when
+ * mask is not requested. For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[in] mask
+ * Input. Pointer to the MLU memory that stores the mask tensor. Set this parameter to NULL when mask is not
+ * requested.
+ * @param[in] grad_output_desc
+ * Input. The descriptor of the gradient with respect to the output tensor.
+ * For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[in] grad_output
+ * Input. Pointer to the MLU memory that stores the gradient with respect to the output tensor.
+ * @param[in] workspace
+ * Input. Pointer to the MLU memory that is used as an extra workspace for the
+ * deformable convolution backward filter operation. For more information about workspace,
+ * see "Cambricon MLUOP User Guide".
+ * @param[in] workspace_size
+ * Input. The size of the extra workspace in bytes needed for
+ * the deformable convolution backward filter operation. You can get the size of the workspace
+ * with the ::mluOpGetDCNBackwardWeightWorkspaceSize function.
+ * @param[in] grad_filter_desc
+ * Input. The descriptor of the gradient with respect to the filter tensor.
+ * For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[out] grad_filter
+ * Output. Pointer to the MLU memory that stores the gradient with respect to the filter tensor.
+ * @param[in] grad_bias_desc
+ * Input. The descriptor of the gradient with respect to the bias tensor. Set this parameter to NULL if the
+ * gradient of the bias tensor is not needed. For detailed information,
+ * see ::mluOpTensorDescriptor_t.
+ * @param[out] grad_bias
+ * Output. Pointer to the MLU memory that stores the gradient with respect to the bias tensor.
+ * Set this parameter to NULL if the gradient of the bias tensor is not needed.
+ * @par Return
+ * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM,
+ * ::MLUOP_STATUS_NOT_SUPPORTED, ::MLUOP_STATUS_NUMERICAL_OVERFLOW
+ *
+ * @par Formula
+ * - See "Deformable Convolution Operator" section in "Cambricon MLUOP User Guide" for details.
+ *
+ * @par Data Type
+ * - The off-chip data type of \p input, \p offset, \p mask, \p grad_output, \p grad_filter,
+ * and \p grad_bias must be the same.
+ * - The supported off-chip data types of the input tensor and output tensor are as follows:
+ * - input, offset, mask, grad_output, grad_filter, grad_bias: half, float.
+ * - This function supports any combinations of the following on-chip data types for input tensor
+ * \p grad_output and \p input on MLU200 series and CE3226.
+ * - \p grad_output on-chip data type: int16, int31.
+ * - \p input on-chip data type: int16, int31.
+ * - \p grad_output off-chip data type can be combined with any supported on-chip data types.
+ * - \p input off-chip data type can be combined with any supported on-chip data types.
+ * - This function also supports floating-point computation on MLU300 series or above.
+ * To perform
+ * floating-point computation, the on-chip data type of \p input and \p grad_output should be
+ * \p MLUOP_DTYPE_INVALID or the same as the corresponding off-chip data type.
+ *
+ * @par Data Layout
+ * - The data layout of the input, offset, mask, grad_output, and grad_filter
+ * should be \p MLUOP_LAYOUT_NHWC.
+ * - The data layout of grad_bias should be \p MLUOP_LAYOUT_ARRAY.
+ *
+ * @par Scale Limitation
+ * - The input, offset, mask, grad_output, grad_filter, grad_bias and
+ * the deformable convolution descriptor
+ * (including pad, stride, dilation, deformable_group, conv_group, im2col_step) must meet the
+ * following requirements:
+ * - input tensor: \p batch > 0, \p height > 0, \p width > 0, \p channel > 0.
+ * - offset tensor: \p batch should be equal to the batch of input tensor, \p height and \p width
+ * should be equal to the height and width of output tensor. \p channel should be equal to
+ * deformable_group * filter_height * filter_width * 2.
+ * - mask tensor: When mask is needed, \p batch should be equal to the batch of input tensor,
+ * \p height and \p width should be equal to the height and width of output tensor.
+ * \p channel should be equal to deformable_group * filter_height * filter_width.
+ * - grad_bias tensor: When the gradient of bias is needed, \p grad_bias should be a
+ * one-dimensional array with the length of \p out_channel.
+ * - The value of (im2col_step * out_height * out_width * filter_h * filter_w * input_channel)
+ * should be less than or equal to the INT_MAX defined in limits.h.
+ *
+ * @par API Dependency
+ * - Before calling this function to implement the backward filter of deformable convolution,
+ * you need to prepare all the parameters passed to this function. See each parameter
+ * description for details.
+ *
+ * @par Performance Optimization
+ * - To achieve better performance, set the im2col_step to the batch size.
+ *
+ * @par Note
+ * - \p input, \p offset, \p mask, \p grad_output, \p grad_filter, and \p grad_bias
+ * must be contiguous in the MLU memory.
+ *
+ * @par Requirements
+ * - None.
+ *
+ * @par Example
+ * - None.
+ *
+ * @par Reference
+ * - https://github.com/msracver/Deformable-ConvNets
+ * - Deformable Convolutional Networks, Jifeng Dai, et al., 2017.
+ * - Deformable ConvNets v2: More Deformable, Better Results, Xizhou Zhu, et al., 2018.
+ */
+mluOpStatus_t MLUOP_WIN_API
+mluOpDCNBackwardWeight(mluOpHandle_t handle,
+                       const mluOpDCNDescriptor_t dcn_desc,
+                       const mluOpTensorDescriptor_t input_desc,
+                       const void *input,
+                       const mluOpTensorDescriptor_t offset_desc,
+                       const void *offset,
+                       const mluOpTensorDescriptor_t mask_desc,
+                       const void *mask,
+                       const mluOpTensorDescriptor_t grad_output_desc,
+                       const void *grad_output,
+                       void *workspace,
+                       size_t workspace_size,
+                       const mluOpTensorDescriptor_t grad_filter_desc,
+                       void *grad_filter,
+                       const mluOpTensorDescriptor_t grad_bias_desc,
+                       void *grad_bias);
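To make the documented output shapes concrete, a short sketch of the gradient descriptors this entry point expects (symbols follow the gtest comments later in this patch: co = output channels, kh/kw = filter height/width, ci = input channels, g = conv_group; all names are illustrative):

    int grad_filter_dims[4] = {co, kh, kw, ci / g};  // NHWC filter gradient
    int grad_bias_dims[1] = {co};                    // 1-D, length out_channel
    mluOpSetTensorDescriptor(grad_filter_desc, MLUOP_LAYOUT_NHWC,
                             MLUOP_DTYPE_FLOAT, 4, grad_filter_dims);
    mluOpSetTensorDescriptor(grad_bias_desc, MLUOP_LAYOUT_ARRAY,
                             MLUOP_DTYPE_FLOAT, 1, grad_bias_dims);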
+
 // Group:DCN
 /*!
  * @brief Returns in \p workspace_size the size of the MLU memory that is used as an extra
@@ -12203,7 +12667,7 @@ mluOpGetDCNBakcwardDataWorkspaceSize(mluOpHandle_t handle,
  * @param[in] offset
  * Input. Pointer to the MLU memory that stores the offset tensor.
  * @param[in] mask_desc
- * Input. The descriptor of the scaling factor to be applied for each position in the convolution
+ * Input. The descriptor of the scaling factor to be applied to each position in the convolution
  * kernel. The shape of mask must be (batch, out_height, out_width,
  * deformable_group * filter_height * filter_width). Set this parameter to NULL when
  * mask is not requested. For detailed information, see ::mluOpTensorDescriptor_t.
@@ -12225,7 +12689,7 @@ mluOpGetDCNBakcwardDataWorkspaceSize(mluOpHandle_t handle,
  * deformable convolution backward data operation. For more information about workspace,
  * see "Cambricon MLU-OPS User Guide".
  * @param[in] workspace_size
- * Input. The size of the extra workspace in bytes that needs to be used in
+ * Input. The size of the extra workspace in bytes needed for
  * the deformable convolution backward data operation. You can get the size of the workspace
  * with the ::mluOpGetDCNBakcwardDataWorkspaceSize function.
  * @param[in] grad_input_desc
@@ -12265,8 +12729,8 @@ mluOpGetDCNBakcwardDataWorkspaceSize(mluOpHandle_t handle,
  * - \p filter onchip data type: int16, int31.
  * - \p grad_output offchip data type can be combined with any supported onchip data types.
  * - \p filter offchip data type can be combined with any supported onchip data types.
- * - This function also supports float-point computation on MLU300 series or above. To perform
- * float-point computation, the onchip data type of \p grad_output and \p filter must be
+ * - This function also supports floating-point computation on MLU300 series or above. To perform
+ * floating-point computation, the onchip data type of \p grad_output and \p filter must be
  * \p MLUOP_DTYPE_INVALID or the same as the corresponding offchip data type.
  *
  * @par Data Layout
diff --git a/test/mlu_op_gtest/pb_gtest/src/internal_kernel/transpose_cpu/transpose_cpu.cpp b/test/mlu_op_gtest/pb_gtest/src/internal_kernel/transpose_cpu/transpose_cpu.cpp
new file mode 100644
index 000000000..3a807147d
--- /dev/null
+++ b/test/mlu_op_gtest/pb_gtest/src/internal_kernel/transpose_cpu/transpose_cpu.cpp
@@ -0,0 +1,136 @@
+/*************************************************************************
+ * Copyright (C) [2022] by Cambricon, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/
+
+#include "transpose_cpu.h"
+#include
+#include "core/tensor.h"
+
+
+template <typename T>
+static void transposeCpuNd(const int loop_d, T *x, T *y, const uint64_t sum,
+                           uint64_t *dim, uint64_t *DIM, uint64_t *permute) {
+  for (int loop_t = 0; loop_t < loop_d; loop_t++) {
+    T *output = (T *)(y + sum * loop_t);
+    T *input = (T *)(x + sum * loop_t);
+    uint64_t in_index = 0, out_index = 0;
+
+    for (dim[0] = 0; dim[0] < DIM[0]; dim[0]++) {
+      for (dim[1] = 0; dim[1] < DIM[1]; dim[1]++) {
+        for (dim[2] = 0; dim[2] < DIM[2]; dim[2]++) {
+          for (dim[3] = 0; dim[3] < DIM[3]; dim[3]++) {
+            for (dim[4] = 0; dim[4] < DIM[4]; dim[4]++) {
+              for (dim[5] = 0; dim[5] < DIM[5]; dim[5]++) {
+                for (dim[6] = 0; dim[6] < DIM[6]; dim[6]++) {
+                  for (dim[7] = 0; dim[7] < DIM[7]; dim[7]++) {
+                    in_index =
+                        dim[0] * DIM[1] * DIM[2] * DIM[3] * DIM[4] * DIM[5] *
+                            DIM[6] * DIM[7] +
+                        dim[1] * DIM[2] * DIM[3] * DIM[4] * DIM[5] * DIM[6] *
+                            DIM[7] +
+                        dim[2] * DIM[3] * DIM[4] * DIM[5] * DIM[6] * DIM[7] +
+                        dim[3] * DIM[4] * DIM[5] * DIM[6] * DIM[7] +
+                        dim[4] * DIM[5] * DIM[6] * DIM[7] +
+                        dim[5] * DIM[6] * DIM[7] + dim[6] * DIM[7] + dim[7];
+                    out_index =
+                        dim[permute[0]] * DIM[permute[1]] * DIM[permute[2]] *
+                            DIM[permute[3]] * DIM[permute[4]] *
+                            DIM[permute[5]] * DIM[permute[6]] *
+                            DIM[permute[7]] +
+                        dim[permute[1]] * DIM[permute[2]] * DIM[permute[3]] *
+                            DIM[permute[4]] * DIM[permute[5]] *
+                            DIM[permute[6]] * DIM[permute[7]] +
+                        dim[permute[2]] * DIM[permute[3]] * DIM[permute[4]] *
+                            DIM[permute[5]] * DIM[permute[6]] *
+                            DIM[permute[7]] +
+                        dim[permute[3]] * DIM[permute[4]] * DIM[permute[5]] *
+                            DIM[permute[6]] * DIM[permute[7]] +
+                        dim[permute[4]] * DIM[permute[5]] * DIM[permute[6]] *
+                            DIM[permute[7]] +
+                        dim[permute[5]] * DIM[permute[6]] * DIM[permute[7]] +
+                        dim[permute[6]] * DIM[permute[7]] + dim[permute[7]];
+                    output[out_index] = input[in_index];
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+mluOpStatus_t mluOpTransposeCpu(const int64_t dim_desc,
+                                const std::vector<int> permute_desc,
+                                const mluOpTensorDescriptor_t x_desc,
+                                const void *x,
+                                const mluOpTensorDescriptor_t y_desc, void *y) {
+  PARAM_CHECK("[mluOpTransposeCpu]", x_desc != NULL);
+  PARAM_CHECK("[mluOpTransposeCpu]", y_desc != NULL);
+  uint64_t sum = mluOpGetTensorElementNum(x_desc);
+  // zero elements, return success
+  if (sum == 0 || x_desc->dim == 0 || y_desc->dim == 0) {
+    VLOG(5) << "mluOpTransposeCpu:: zero elements, return success.";
+    return MLUOP_STATUS_SUCCESS;
+  }
+  PARAM_CHECK("[mluOpTransposeCpu]", x != NULL);
+  PARAM_CHECK("[mluOpTransposeCpu]", y != NULL);
+
+  const uint64_t dim_all = dim_desc;
+  auto data_type = x_desc->dtype;
+  int loop_d = 1;
+  if (data_type == MLUOP_DTYPE_INT31) {
+    loop_d = 2;
+  }
+  // do not change the inited value(8) in permute
+  // 8 is used to match TRANSPOSE_MAX_DIM, which makes the loop below
+  // apply to all-dims transpose, from 2D transpose to 8D transpose
+  // if you change macro TRANSPOSE_MAX_DIM, the inited value(8) should also be
+  // changed to TRANSPOSE_MAX_DIM.
+  // And the loop level should be equal to TRANSPOSE_MAX_DIM.
+  uint64_t permute[TRANSPOSE_MAX_DIM] = {8, 8, 8, 8, 8, 8, 8, 8};
+  uint64_t DIM[TRANSPOSE_MAX_DIM + 1] = {1, 1, 1, 1, 1, 1, 1, 1, 1};
+  uint64_t dim[TRANSPOSE_MAX_DIM + 1] = {0};
+
+  if (x_desc->dim != dim_all || y_desc->dim != dim_all) {
+    LOG(ERROR)
+        << "mluOpTransposeCpu: dimension information mismatch, dim of x: "
+        << x_desc->dim << ", dim of y: " << y_desc->dim
+        << ", dim of descriptor: " << dim_all;
+    return MLUOP_STATUS_BAD_PARAM;
+  }
+
+  for (int i = 0; i < dim_all; i++) {
+    permute[i] = permute_desc[i];
+    DIM[i] = x_desc->dims[i];
+  }
+  if (MLUOP_DTYPE_INT31 == data_type) {
+    transposeCpuNd(loop_d, (int16_t *)x, (int16_t *)y, sum, dim, DIM, permute);
+  } else if (MLUOP_DTYPE_COMPLEX_HALF == data_type ||
+             MLUOP_DTYPE_COMPLEX_FLOAT == data_type) {
+    transposeCpuNd(loop_d, (double *)x, (double *)y, sum, dim, DIM, permute);
+  } else {
+    transposeCpuNd(loop_d, (float *)x, (float *)y, sum, dim, DIM, permute);
+  }
+  return MLUOP_STATUS_SUCCESS;
+}
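A brief usage sketch of this host-side helper as a gtest executor might call it (descriptors assumed to be created and set elsewhere; variable names are illustrative):

    // Transpose a 4-D NHWC float tensor to NCHW on the host.
    std::vector<int> permute = {0, 3, 1, 2};
    mluOpStatus_t status =
        mluOpTransposeCpu(4, permute, x_desc, x_host, y_desc, y_host);
    // Both descriptors must describe 4-D tensors here; otherwise
    // MLUOP_STATUS_BAD_PARAM is returned.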
diff --git a/test/mlu_op_gtest/pb_gtest/src/internal_kernel/transpose_cpu/transpose_cpu.h b/test/mlu_op_gtest/pb_gtest/src/internal_kernel/transpose_cpu/transpose_cpu.h
new file mode 100644
index 000000000..198fb58d8
--- /dev/null
+++ b/test/mlu_op_gtest/pb_gtest/src/internal_kernel/transpose_cpu/transpose_cpu.h
@@ -0,0 +1,39 @@
+/*************************************************************************
+ * Copyright (C) [2022] by Cambricon, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/
+#ifndef TEST_MLU_OP_GTEST_SRC_INTERNAL_KERNEL_TRANSPOSE_CPU_TRANSPOSE_CPU_H_
+#define TEST_MLU_OP_GTEST_SRC_INTERNAL_KERNEL_TRANSPOSE_CPU_TRANSPOSE_CPU_H_
+
+#include <vector>
+#include "core/tensor.h"
+#include "kernels/kernel.h"
+#include "kernels/debug.h"
+
+#define TRANSPOSE_MAX_DIM 8
+
+mluOpStatus_t mluOpTransposeCpu(const int64_t dim,
+                                const std::vector<int> permute,
+                                const mluOpTensorDescriptor_t x_desc,
+                                const void *x,
+                                const mluOpTensorDescriptor_t y_desc, void *y);
+
+#endif  // TEST_MLU_OP_GTEST_SRC_INTERNAL_KERNEL_TRANSPOSE_CPU_TRANSPOSE_CPU_H_
diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_weight/dcn_backward_weight.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_weight/dcn_backward_weight.cpp
new file mode 100644
index 000000000..b29e9a525
--- /dev/null
+++ b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_weight/dcn_backward_weight.cpp
@@ -0,0 +1,672 @@
+/*************************************************************************
+ * Copyright (C) [2019-2022] by Cambricon, Inc.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/
+#include "dcn_backward_weight.h"
+#include "internal_kernel/transpose_cpu/transpose_cpu.h"
+
+#define USE_OPENBLAS 0
+
+#if USE_OPENBLAS
+#include
+#endif
+
+namespace mluoptest {
+// input      :[N,hi,wi,ci]
+// offset     :[N,ho,wo,dg*kh*kw*2]
+// mask       :[N,ho,wo,dg*kh*kw]  // optional
+// grad_output:[N,ho,wo,co]
+// grad_weight:[co,kh,kw,ci/g]
+// grad_bias  :[co]  // optional
+static inline bool isFixData(mluOpDataType_t type) {
+  if (MLUOP_DTYPE_INT8 == type || MLUOP_DTYPE_INT16 == type ||
+      MLUOP_DTYPE_INT31 == type) {
+    return true;
+  }
+  return false;
+}
+
+int DcnBackwardWeightExecutor::getCoefficientOfLT2CT() {
+  auto input_dtype =
+      cvtProtoDtypeToMluOp(parser_->getProtoNode()->input(0).dtype());
+  int lt_compute_force = 0;
+  int ct_compute_force = input_dtype == MLUOP_DTYPE_FLOAT ? 32 : 64;
+  if (input_dtype == MLUOP_DTYPE_FLOAT) {
+    lt_compute_force = 2 * 1.5 * 1024;
+  } else {
+    lt_compute_force = 2 * 0.375 * 1024;
+  }
+  return lt_compute_force / ct_compute_force;
+}
+
+void DcnBackwardWeightExecutor::paramCheck() {
+  if (parser_->getInputNum() != 3 && parser_->getInputNum() != 4) {
+    LOG(ERROR) << "DCN_Backward_Weight: tensor input number is wrong.";
+  }
+
+  auto dtype =
+      cvtProtoDtypeToMluOp(parser_->getProtoNode()->input(0).onchip_dtype());
+  input_onchip_dtype = dtype;
+  if (isFixData(dtype)) {
+    parser_->input(0)->oc_dt = MLUOP_DTYPE_INVALID;
+  }
+
+  if (parser_->getInputNum() == 3) {
+    dtype =
+        cvtProtoDtypeToMluOp(parser_->getProtoNode()->input(2).onchip_dtype());
+    grad_output_onchip_dtype = dtype;
+    if (isFixData(dtype)) {
+      parser_->input(2)->oc_dt = MLUOP_DTYPE_INVALID;
+    }
+  } else {
+    dtype =
+        cvtProtoDtypeToMluOp(parser_->getProtoNode()->input(3).onchip_dtype());
+    grad_output_onchip_dtype = dtype;
+    if (isFixData(dtype)) {
+      parser_->input(3)->oc_dt = MLUOP_DTYPE_INVALID;
+    }
+  }
+
+  if (!parser_->getProtoNode()->has_dcn_param()) {
+    LOG(ERROR) << "Missing dcn param.";
+  }
+
+  if (parser_->getOutputNum() != 1 && parser_->getOutputNum() != 2) {
+    LOG(ERROR) << "DCN_Backward_Weight: tensor output number is wrong.";
+  }
+  TensorLayout input_order = parser_->getProtoNode()->input(0).layout();
+  if (input_order != LAYOUT_NHWC) {
+    LOG(ERROR) << "DCN_Backward_Weight: input tensor layout should be NHWC.";
+  }
+
+  int N = parser_->getProtoNode()->input(0).shape().dims(0);
+  int ci = parser_->getProtoNode()->input(0).shape().dims(3);
+  int co = parser_->getProtoNode()->output(0).shape().dims(0);
+
+  auto dcn_param = parser_->getProtoNode()->dcn_param();
+  dimnb = dcn_param.dimnb();
+  for (int i = 0; i < dcn_param.pad_size(); ++i) {
+    pad[i] = dcn_param.pad(i);
+  }
+  for (int i = 0; i < dcn_param.stride_size(); ++i) {
+    stride[i] = dcn_param.stride(i);
+  }
+  for (int i = 0; i < dcn_param.dilation_size(); ++i) {
+    dilation[i] = dcn_param.dilation(i);
+  }
+  if (dcn_param.has_deformable_group()) {
+    dg = dcn_param.deformable_group();
+  }
+  if (dcn_param.has_conv_group()) {
+    g = dcn_param.conv_group();
+  }
+  if (dcn_param.has_im2col_step()) {
+    im2col_step = dcn_param.im2col_step();
+  }
+
+  if (dimnb != 4) {
+    LOG(ERROR) << "[DCN_Backward_Weight]: dimnb should be 4.";
+  }
+
+  if (ci % dg) {
+    LOG(ERROR) << "[DCN_Backward_Weight]: ci should be divisible by "
+                  "deformable_group.";
+  }
+
+  if (ci % g) {
+    LOG(ERROR) << "[DCN_Backward_Weight]: ci should be divisible by "
+                  "conv_group.";
+  }
+
+  if (co % g) {
+    LOG(ERROR) << "[DCN_Backward_Weight]: co should be divisible by "
+                  "conv_group.";
+  }
+
+  if (N % im2col_step) {
+    LOG(ERROR) << "[DCN_Backward_Weight]: N should be divisible by "
+                  "im2col_step.";
+  }
+}
+
+void DcnBackwardWeightExecutor::workspaceMalloc() {
+  input_desc = tensor_desc_[0].tensor;
+  offset_desc = tensor_desc_[1].tensor;
+  mluOpDataType_t compute_type;
+  auto dcn_param = parser_->getProtoNode()->dcn_param();
+  if (dcn_param.has_compute_type()) {
+    compute_type = cvtProtoDtypeToMluOp(dcn_param.compute_type());
+  } else {
+    compute_type = MLUOP_DTYPE_FLOAT;
+  }
+
+  mluOpDCNDescriptor_t dcn_desc = cpu_runtime_.allocate(
+      mluOpCreateDCNDescriptor, mluOpDestroyDCNDescriptor);
+
+  MLUOP_CHECK(mluOpSetDCNDescriptor(dcn_desc, dimnb, pad, stride, dilation, dg,
+                                    g, im2col_step, compute_type));
+
+  if (parser_->getInputNum() == 3) {
+    mask_desc = nullptr;
+    grad_output_desc = tensor_desc_[2].tensor;
+    grad_weight_desc = tensor_desc_[3].tensor;
+    grad_bias_desc =
+        parser_->getOutputNum() == 1 ? nullptr : tensor_desc_[4].tensor;
nullptr : tensor_desc_[4].tensor; + } else { + mask_desc = tensor_desc_[2].tensor; + grad_output_desc = tensor_desc_[3].tensor; + grad_weight_desc = tensor_desc_[4].tensor; + grad_bias_desc = + parser_->getOutputNum() == 1 ? nullptr : tensor_desc_[5].tensor; + } + + input_desc->onchip_dtype = input_onchip_dtype; + grad_output_desc->onchip_dtype = grad_output_onchip_dtype; + MLUOP_CHECK(mluOpGetDCNBackwardWeightWorkspaceSize( + handle_, dcn_desc, input_desc, offset_desc, mask_desc, grad_output_desc, + grad_weight_desc, grad_bias_desc, &workspace_size)); + + if (workspace_size != 0) { + workspace = mlu_runtime_.allocate(workspace_size); + } + + eva_->setMluWorkspaceSize(workspace_size); + cpu_runtime_.deallocate(dcn_desc); +} + +void DcnBackwardWeightExecutor::workspaceFree() { + if (workspace != nullptr) { + mlu_runtime_.deallocate(workspace); + } +} + +void DcnBackwardWeightExecutor::compute() { + input_desc = tensor_desc_[0].tensor; + offset_desc = tensor_desc_[1].tensor; + mluOpDataType_t compute_type; + auto dcn_param = parser_->getProtoNode()->dcn_param(); + if (dcn_param.has_compute_type()) { + compute_type = cvtProtoDtypeToMluOp(dcn_param.compute_type()); + } else { + compute_type = input_desc->dtype; + } + + mluOpDCNDescriptor_t dcn_desc = cpu_runtime_.allocate( + mluOpCreateDCNDescriptor, mluOpDestroyDCNDescriptor); + + MLUOP_CHECK(mluOpSetDCNDescriptor(dcn_desc, dimnb, pad, stride, dilation, dg, + g, im2col_step, compute_type)); + + input = data_vector_[0].device_ptr; + offset = data_vector_[1].device_ptr; + if (parser_->getInputNum() == 3) { + mask_desc = nullptr; + mask = nullptr; + grad_output_desc = tensor_desc_[2].tensor; + grad_output = data_vector_[2].device_ptr; + grad_weight_desc = tensor_desc_[3].tensor; + grad_weight = data_vector_[3].device_ptr; + grad_bias_desc = + parser_->getOutputNum() == 1 ? nullptr : tensor_desc_[4].tensor; + grad_bias = + parser_->getOutputNum() == 1 ? nullptr : data_vector_[4].device_ptr; + } else { + mask_desc = tensor_desc_[2].tensor; + mask = data_vector_[2].device_ptr; + grad_output_desc = tensor_desc_[3].tensor; + grad_output = data_vector_[3].device_ptr; + grad_weight_desc = tensor_desc_[4].tensor; + grad_weight = data_vector_[4].device_ptr; + grad_bias_desc = + parser_->getOutputNum() == 1 ? nullptr : tensor_desc_[5].tensor; + grad_bias = + parser_->getOutputNum() == 1 ? 
nullptr : data_vector_[5].device_ptr;
+  }
+
+  input_desc->onchip_dtype = input_onchip_dtype;
+  grad_output_desc->onchip_dtype = grad_output_onchip_dtype;
+
+  VLOG(4) << "call mluOpDCNBackwardWeight()";
+  interface_timer_.start();
+  MLUOP_CHECK(mluOpDCNBackwardWeight(
+      handle_, dcn_desc, input_desc, input, offset_desc, offset, mask_desc,
+      mask, grad_output_desc, grad_output, workspace, workspace_size,
+      grad_weight_desc, grad_weight, grad_bias_desc, grad_bias));
+
+  interface_timer_.stop();
+  cpu_runtime_.deallocate(dcn_desc);
+}
+
+static float bilinear(float *input_ptr, const int &ci_offset, const int &hi,
+                      const int &wi, const int &ci, const float &h_in,
+                      const float &w_in) {
+  int h_low = floor(h_in);
+  int w_low = floor(w_in);
+  int h_high = h_low + 1;
+  int w_high = w_low + 1;
+
+  float lh = h_in - h_low;
+  float lw = w_in - w_low;
+  float hh = 1 - lh;
+  float hw = 1 - lw;
+
+  float v1 = 0, v2 = 0, v3 = 0, v4 = 0;
+
+  // Corners that fall outside the feature map keep their zero value.
+  if (h_low >= 0 && w_low >= 0) {
+    v1 = input_ptr[(h_low * wi + w_low) * ci + ci_offset];
+  }
+
+  if (h_low >= 0 && w_high <= wi - 1) {
+    v2 = input_ptr[(h_low * wi + w_high) * ci + ci_offset];
+  }
+
+  if (h_high <= hi - 1 && w_low >= 0) {
+    v3 = input_ptr[(h_high * wi + w_low) * ci + ci_offset];
+  }
+
+  if (h_high <= hi - 1 && w_high <= wi - 1) {
+    v4 = input_ptr[(h_high * wi + w_high) * ci + ci_offset];
+  }
+
+  float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+  float val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
+  return val;
+}
+
+static void im2col(const int &N, const int &im2col_step, const int &dg,
+                   const int &hi, const int &wi, const int &ci, const int &ho,
+                   const int &wo, const int &co, const int &kh, const int &kw,
+                   const int &pt, const int &pb, const int &pl, const int &pr,
+                   const int &sh, const int &sw, const int &dh, const int &dw,
+                   const float *cpu_input, const float *cpu_offset,
+                   const float *cpu_mask, float *buffer) {
+  // input      :[N,hi,wi,ci]
+  // offset     :[N,ho,wo,dg*kh*kw*2]
+  // mask       :[N,ho,wo,dg*kh*kw]  // optional
+  // grad_output:[N,ho,wo,co]
+  // grad_weight:[co,kh,kw,ci/g]
+  // grad_bias  :[co]  // optional
+  for (int idx_n = 0; idx_n < im2col_step; ++idx_n) {
+    for (int idx_ho = 0; idx_ho < ho; ++idx_ho) {
+      for (int idx_wo = 0; idx_wo < wo; ++idx_wo) {
+        float *input_ptr = (float *)cpu_input + idx_n * hi * wi * ci;
+        float *offset_ptr =
+            (float *)cpu_offset +
+            ((idx_n * ho + idx_ho) * wo + idx_wo) * dg * kh * kw * 2;
+        float *mask_ptr =
+            cpu_mask != nullptr
+                ? (float *)cpu_mask +
+                      ((idx_n * ho + idx_ho) * wo + idx_wo) * dg * kh * kw
+                : nullptr;
+        float *columns_ptr =
+            (float *)buffer +
+            ((idx_n * ho + idx_ho) * wo + idx_wo) * kh * kw * ci;
+        const int hi_start = idx_ho * sh - pt;
+        const int wi_start = idx_wo * sw - pl;
+        for (int idx_kh = 0; idx_kh < kh; ++idx_kh) {
+          for (int idx_kw = 0; idx_kw < kw; ++idx_kw) {
+            for (int idx_dg = 0; idx_dg < dg; ++idx_dg) {
+              const int data_offset_h =
+                  ((idx_dg * kh + idx_kh) * kw + idx_kw) * 2;
+              const int data_offset_w =
+                  ((idx_dg * kh + idx_kh) * kw + idx_kw) * 2 + 1;
+              const int data_mask = (idx_dg * kh + idx_kh) * kw + idx_kw;
+              const float offset_h = offset_ptr[data_offset_h];
+              const float offset_w = offset_ptr[data_offset_w];
+              const float mask =
+                  mask_ptr != nullptr ? mask_ptr[data_mask] : 1.0f;
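+              // Deformable sampling (descriptive note): each kernel tap reads
+              // the input at its regular grid position, i.e.
+              // (hi_start + idx_kh * dh, wi_start + idx_kw * dw), shifted by
+              // the learned fractional offset; the bilinear sample is then
+              // scaled by the optional modulation mask, as in the DCNv2
+              // formulation.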
+              const float h_in = hi_start + idx_kh * dh + offset_h;
+              const float w_in = wi_start + idx_kw * dw + offset_w;
+              if (h_in > -1 && w_in > -1 && h_in < hi && w_in < wi) {
+                for (int idx_ci = 0; idx_ci < ci / dg; ++idx_ci) {
+                  const int ci_offset = idx_dg * ci / dg + idx_ci;
+                  const int columns_offset =
+                      (idx_kh * kw + idx_kw) * ci + ci_offset;
+                  columns_ptr[columns_offset] =
+                      bilinear(input_ptr, ci_offset, hi, wi, ci, h_in, w_in) *
+                      mask;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void DcnBackwardWeightExecutor::transpose(float *input, float *output,
+                                          const int dims[], const int dim_num,
+                                          int permute[]) {
+  int64_t dim_desc = dim_num;
+  std::vector<int> permute_desc;
+  if (dim_desc > 8 || dim_desc <= 0) {
+    LOG(ERROR) << "dim_desc is " << dim_desc
+               << ", it should be greater than 0 and no more than 8";
+  }
+  { std::vector<int>().swap(permute_desc); }
+  for (int i = 0; i < dim_num; i++) {
+    permute_desc.push_back(permute[i]);
+  }
+  mluOpTensorDescriptor_t input_desc, output_desc;
+  input_desc = cpu_runtime_.allocate(mluOpCreateTensorDescriptor,
+                                     mluOpDestroyTensorDescriptor);
+  output_desc = cpu_runtime_.allocate(mluOpCreateTensorDescriptor,
+                                      mluOpDestroyTensorDescriptor);
+
+  // sized to TRANSPOSE_MAX_DIM so every dim_num accepted above fits
+  int dims_trans[TRANSPOSE_MAX_DIM];
+  for (int i = 0; i < dim_num; ++i) {
+    dims_trans[i] = dims[permute[i]];
+  }
+
+  MLUOP_CHECK(mluOpSetTensorDescriptor(input_desc, MLUOP_LAYOUT_ARRAY,
+                                       MLUOP_DTYPE_FLOAT, dim_num, dims));
+  MLUOP_CHECK(mluOpSetTensorDescriptor(output_desc, MLUOP_LAYOUT_ARRAY,
+                                       MLUOP_DTYPE_FLOAT, dim_num, dims_trans));
+
+  MLUOP_CHECK(mluOpTransposeCpu(dim_desc, permute_desc, input_desc, input,
+                                output_desc, output));
+  cpu_runtime_.deallocate(input_desc);
+  cpu_runtime_.deallocate(output_desc);
+}
+
+static void BatchMatMul(const int &g, const int &m, const int &k, const int &n,
+                        float *input_a, float *input_b, float *output,
+                        bool is_transa, bool is_transb) {
+  const int batch_size = g;
+
+  assert(batch_size >= 1);
+#if USE_OPENBLAS
+  const CBLAS_ORDER Order = CblasRowMajor;
+  const CBLAS_TRANSPOSE TransA = is_transa ? CblasTrans : CblasNoTrans;
+  const CBLAS_TRANSPOSE TransB = is_transb ? CblasTrans : CblasNoTrans;
+
+  int lda = is_transa ? m : k;
+  int ldb = is_transb ?
k : n; + int ldc = n; + + float alpha = 1.0f; + float beta = 1.0f; +#else + auto matmul = [](float *lhs, float *rhs, float *output, bool is_trans_a, + bool is_trans_b, int M, int N, int K) { + for (int m = 0; m < M; m++) { + for (int n = 0; n < N; n++) { + // output[m * N + n] = 0.0f; + for (int k = 0; k < K; k++) { + int lhs_idx = m * K + k; + if (is_trans_a) lhs_idx = k * M + m; + int rhs_idx = k * N + n; + if (is_trans_b) rhs_idx = n * K + k; + output[m * N + n] += lhs[lhs_idx] * rhs[rhs_idx]; + } + } + } + }; +#endif + for (int i = 0; i < batch_size; ++i) { +#if USE_OPENBLAS + cblas_sgemm(Order, TransA, TransB, m, n, k, alpha, input_a + i * m * k, lda, + input_b + i * k * n, ldb, beta, output + i * m * n, ldc); +#else + matmul(input_a + i * m * k, input_b + i * k * n, output + i * m * n, + is_transa, is_transb, m, n, k); +#endif + } +} + +static void dealBias(float *cpu_grad_output, float *cpu_grad_bias, const int &N, + const int &ho, const int &wo, const int &co) { + for (int idx_n = 0; idx_n < N; ++idx_n) { + for (int idx_ho = 0; idx_ho < ho; ++idx_ho) { + for (int idx_wo = 0; idx_wo < wo; ++idx_wo) { + for (int idx_co = 0; idx_co < co; ++idx_co) { + cpu_grad_bias[idx_co] += + cpu_grad_output[((idx_n * ho + idx_ho) * wo + idx_wo) * co + + idx_co]; + } + } + } + } +} + +void DcnBackwardWeightExecutor::computeDCNBackwardWeightCPU( + const int &dg, const int &g, const int &im2col_step, + const mluOpTensorDescriptor_t input_desc, const void *cpu_input, + const mluOpTensorDescriptor_t offset_desc, const void *cpu_offset, + const mluOpTensorDescriptor_t mask_desc, const void *cpu_mask, + const mluOpTensorDescriptor_t grad_output_desc, const void *cpu_grad_output, + const mluOpTensorDescriptor_t grad_weight_desc, void *cpu_grad_weight, + const mluOpTensorDescriptor_t grad_bias_desc, void *cpu_grad_bias, + float *buffer, int pad[], int stride[], int dilation[], + int64_t &theory_ops) { + const int N = input_desc->dims[0]; + const int hi = input_desc->dims[1]; + const int wi = input_desc->dims[2]; + const int ci = input_desc->dims[3]; + const int ho = offset_desc->dims[1]; + const int wo = offset_desc->dims[2]; + const int co = grad_output_desc->dims[3]; + const int kh = grad_weight_desc->dims[1]; + const int kw = grad_weight_desc->dims[2]; + const int pt = pad[0]; + const int pb = pad[1]; + const int pl = pad[2]; + const int pr = pad[3]; + const int sh = stride[0]; + const int sw = stride[1]; + const int dh = dilation[0]; + const int dw = dilation[1]; + + int coeff = getCoefficientOfLT2CT(); + if (g == 1) { + // buffer: | columns_a | + for (int i = 0; i < N / im2col_step; ++i) { + float *input_i = (float *)cpu_input + i * im2col_step * hi * wi * ci; + float *offset_i = + (float *)cpu_offset + i * im2col_step * ho * wo * dg * kh * kw * 2; + float *mask_i = + cpu_mask != nullptr + ? 
(float *)cpu_mask + i * im2col_step * ho * wo * dg * kh * kw + : nullptr; + float *grad_output_i = + (float *)cpu_grad_output + i * im2col_step * ho * wo * co; + // 1.im2col + memset(buffer, 0, (im2col_step * ho * wo * kh * kw * ci) * sizeof(float)); + im2col(N, im2col_step, dg, hi, wi, ci, ho, wo, co, kh, kw, pt, pb, pl, pr, + sh, sw, dh, dw, input_i, offset_i, mask_i, (float *)buffer); + theory_ops += (int64_t)im2col_step * ho * wo * kh * kw * ci * + 15; // bilinear(14) + mask(1) + + float *input_a = grad_output_i; + float *input_b = buffer; + const int k = im2col_step * ho * wo; + const int m = co; + const int n = kh * kw * ci; + // 2.BMM + BatchMatMul(g, m, k, n, input_a, input_b, (float *)cpu_grad_weight, true, + false); + theory_ops += 2 * (int64_t)g * m * k * n / coeff; // lt2ct + } + } else { + // | columns_a | columns_b | grad_output | + float *buffer_columns_a = buffer; + float *buffer_columns_b = + buffer_columns_a + im2col_step * ho * wo * kh * kw * ci; + float *buffer_grad_output = + buffer_columns_b + im2col_step * ho * wo * kh * kw * ci; + for (int i = 0; i < N / im2col_step; ++i) { + float *input_i = (float *)cpu_input + i * im2col_step * hi * wi * ci; + float *offset_i = + (float *)cpu_offset + i * im2col_step * ho * wo * dg * kh * kw * 2; + float *mask_i = + cpu_mask != nullptr + ? (float *)cpu_mask + i * im2col_step * ho * wo * dg * kh * kw + : nullptr; + float *grad_output_i = + (float *)cpu_grad_output + i * im2col_step * ho * wo * co; + // 1.im2col + memset(buffer, 0, (im2col_step * ho * wo * kh * kw * ci) * sizeof(float)); + im2col(N, im2col_step, dg, hi, wi, ci, ho, wo, co, kh, kw, pt, pb, pl, pr, + sh, sw, dh, dw, input_i, offset_i, mask_i, buffer_columns_a); + theory_ops += (int64_t)im2col_step * ho * wo * kh * kw * ci * + 15; // bilinear_count + mask + + // 2.split columns [im2col_step*ho*wo*kh*kw,g, + // ci/g]->[g,im2col_step*ho*wo*kh*kw,ci/g] + int dims_1[3] = {im2col_step * ho * wo * kh * kw, g, ci / g}; + int permute_1[3] = {1, 0, 2}; + transpose(buffer_columns_a, buffer_columns_b, dims_1, 3, permute_1); + theory_ops += (int64_t)im2col_step * ho * wo * kh * kw * ci; + + // 3.transpose grad_output [im2col_step*ho*wo,co]-> + // [g,co/g,im2col_step*ho*wo] + int dims_2[2] = {im2col_step * ho * wo, co}; + int permute_2[2] = {1, 0}; + transpose(grad_output_i, buffer_grad_output, dims_2, 2, permute_2); + theory_ops += (int64_t)im2col_step * ho * wo * co; + + float *input_a = buffer_grad_output; + float *input_b = buffer_columns_b; + const int k = im2col_step * ho * wo; + const int m = co / g; + const int n = kh * kw * ci / g; + + // 4.BMM + BatchMatMul(g, m, k, n, input_a, input_b, (float *)cpu_grad_weight, false, + false); + theory_ops += 2 * (int64_t)g * m * k * n / coeff; // lt2ct + } + } + // 5.grad_bias + if (cpu_grad_bias) { + dealBias((float *)cpu_grad_output, (float *)cpu_grad_bias, N, ho, wo, co); + theory_ops += (int64_t)N * ho * wo * co; + } +} + +void DcnBackwardWeightExecutor::cpuCompute() { + input_desc = tensor_desc_[0].tensor; + offset_desc = tensor_desc_[1].tensor; + cpu_input = cpu_fp32_input_[0]; + cpu_offset = cpu_fp32_input_[1]; + if (parser_->getInputNum() == 3) { + mask_desc = nullptr; + cpu_mask = nullptr; + grad_output_desc = tensor_desc_[2].tensor; + cpu_grad_output = cpu_fp32_input_[2]; + + grad_weight_desc = tensor_desc_[3].tensor; + cpu_grad_weight = cpu_fp32_output_[0]; + grad_bias_desc = + parser_->getOutputNum() == 1 ? nullptr : tensor_desc_[4].tensor; + cpu_grad_bias = + parser_->getOutputNum() == 1 ? 
nullptr : cpu_fp32_output_[1];
+  } else {
+    mask_desc = tensor_desc_[2].tensor;
+    cpu_mask = cpu_fp32_input_[2];
+    grad_output_desc = tensor_desc_[3].tensor;
+    cpu_grad_output = cpu_fp32_input_[3];
+    grad_weight_desc = tensor_desc_[4].tensor;
+    cpu_grad_weight = cpu_fp32_output_[0];
+    grad_bias_desc =
+        parser_->getOutputNum() == 1 ? nullptr : tensor_desc_[5].tensor;
+    cpu_grad_bias =
+        parser_->getOutputNum() == 1 ? nullptr : cpu_fp32_output_[1];
+  }
+
+  const int ho = offset_desc->dims[1];
+  const int wo = offset_desc->dims[2];
+  const int kh = grad_weight_desc->dims[1];
+  const int kw = grad_weight_desc->dims[2];
+  const int ci = input_desc->dims[3];
+  const int co = grad_output_desc->dims[3];
+
+  size_t cpu_buffer_size = 0;
+  if (g == 1) {
+    cpu_buffer_size =
+        (static_cast<size_t>(im2col_step) * ho * wo * kh * kw * ci) *
+        sizeof(float);
+  } else {
+    cpu_buffer_size = (2lu * im2col_step * ho * wo * kh * kw * ci +
+                       im2col_step * ho * wo * co) *
+                      sizeof(float);
+  }
+
+  float *buffer = nullptr;
+  buffer = (float *)cpu_runtime_.allocate(cpu_buffer_size);
+  if (buffer == nullptr) {
+    LOG(ERROR) << "dcn_backward_weight: allocate buffer failed.";
+  }
+  if (cpu_grad_weight) {
+    memset(cpu_grad_weight, 0, co * kh * kw * ci / g * sizeof(float));
+  }
+  if (cpu_grad_bias) {
+    memset(cpu_grad_bias, 0, co * sizeof(float));
+  }
+  theory_ops = 0;
+  computeDCNBackwardWeightCPU(
+      dg, g, im2col_step, input_desc, cpu_input, offset_desc, cpu_offset,
+      mask_desc, cpu_mask, grad_output_desc, cpu_grad_output, grad_weight_desc,
+      cpu_grad_weight, grad_bias_desc, cpu_grad_bias, buffer, pad, stride,
+      dilation, theory_ops);
+
+  cpu_runtime_.deallocate(buffer);
+}
+
+int64_t DcnBackwardWeightExecutor::getTheoryOps() {
+  if (exe_config_->mlu_only) {
+    theory_ops = 0;
+    input_desc = tensor_desc_[0].tensor;
+    offset_desc = tensor_desc_[1].tensor;
+    if (parser_->getInputNum() == 3) {
+      grad_output_desc = tensor_desc_[2].tensor;
+      grad_weight_desc = tensor_desc_[3].tensor;
+      grad_bias_desc =
+          parser_->getOutputNum() == 1 ? nullptr : tensor_desc_[4].tensor;
+    } else {
+      grad_output_desc = tensor_desc_[3].tensor;
+      grad_weight_desc = tensor_desc_[4].tensor;
+      grad_bias_desc =
+          parser_->getOutputNum() == 1 ?
nullptr : tensor_desc_[5].tensor; + } + const int N = input_desc->dims[0]; + const int hi = input_desc->dims[1]; + const int wi = input_desc->dims[2]; + const int ci = input_desc->dims[3]; + const int ho = offset_desc->dims[1]; + const int wo = offset_desc->dims[2]; + const int co = grad_output_desc->dims[3]; + const int kh = grad_weight_desc->dims[1]; + const int kw = grad_weight_desc->dims[2]; + int coeff = getCoefficientOfLT2CT(); + const int k = im2col_step * ho * wo; + const int m = co / g; + const int n = kh * kw * ci / g; + if (g == 1) { + for (int i = 0; i < N / im2col_step; ++i) { + theory_ops += (int64_t)im2col_step * ho * wo * kh * kw * ci * + 15; // bilinear(14) + mask(1) + theory_ops += 2 * (int64_t)g * m * k * n / coeff; // lt2ct + } + } else { + for (int i = 0; i < N / im2col_step; ++i) { + theory_ops += (int64_t)im2col_step * ho * wo * kh * kw * ci * + 15; // bilinear_count + mask + theory_ops += (int64_t)im2col_step * ho * wo * kh * kw * ci; + theory_ops += (int64_t)im2col_step * ho * wo * co; + theory_ops += 2 * (int64_t)g * m * k * n / coeff; // lt2ct + } + } + if (grad_bias_desc) { + theory_ops += (int64_t)N * ho * wo * co; + } + } + VLOG(4) << "getTheoryOps: " << theory_ops << " ops"; + return theory_ops; +} + +} // namespace mluoptest diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_weight/dcn_backward_weight.h b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_weight/dcn_backward_weight.h new file mode 100755 index 000000000..a193499d5 --- /dev/null +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_weight/dcn_backward_weight.h @@ -0,0 +1,86 @@ +/************************************************************************* + * Copyright (C) [2019-2022] by Cambricon, Inc. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/
+#ifndef TEST_MLUOP_GTEST_SRC_ZOO_DCN_BACKWARD_WEIGHT_DCN_BACKWARD_WEIGHT_H_
+#define TEST_MLUOP_GTEST_SRC_ZOO_DCN_BACKWARD_WEIGHT_DCN_BACKWARD_WEIGHT_H_
+
+#include <vector>
+#include "executor.h"
+
+namespace mluoptest {
+
+class DcnBackwardWeightExecutor : public Executor {
+ public:
+  DcnBackwardWeightExecutor() {}
+  ~DcnBackwardWeightExecutor() {}
+
+  void workspaceMalloc();
+  void workspaceFree();
+  void paramCheck();
+  void compute();
+  void cpuCompute();
+  int64_t getTheoryOps() override;
+
+ private:
+  void transpose(float *input, float *output, const int dims[],
+                 const int dim_num, int permute[]);
+  int getCoefficientOfLT2CT();
+  void computeDCNBackwardWeightCPU(
+      const int &dg, const int &g, const int &im2col_step,
+      const mluOpTensorDescriptor_t input_desc, const void *cpu_input,
+      const mluOpTensorDescriptor_t offset_desc, const void *cpu_offset,
+      const mluOpTensorDescriptor_t mask_desc, const void *cpu_mask,
+      const mluOpTensorDescriptor_t grad_output_desc,
+      const void *cpu_grad_output,
+      const mluOpTensorDescriptor_t grad_weight_desc, void *cpu_grad_weight,
+      const mluOpTensorDescriptor_t grad_bias_desc, void *cpu_grad_bias,
+      float *buffer, int pad[], int stride[], int dilation[],
+      int64_t &theory_ops);
+
+  mluOpDataType_t input_onchip_dtype;
+  mluOpDataType_t grad_output_onchip_dtype;
+
+  mluOpTensorDescriptor_t input_desc;
+  mluOpTensorDescriptor_t offset_desc;
+  mluOpTensorDescriptor_t mask_desc = nullptr;  // optional
+  mluOpTensorDescriptor_t grad_output_desc;
+  mluOpTensorDescriptor_t grad_weight_desc;
+  mluOpTensorDescriptor_t grad_bias_desc = nullptr;  // optional
+
+  int dimnb;
+  int pad[4];
+  int stride[2];
+  int dilation[2];
+  int dg;
+  int g;
+  int im2col_step;
+
+  void *input = nullptr;
+  void *offset = nullptr;
+  void *mask = nullptr;
+  void *grad_output = nullptr;
+  void *grad_weight = nullptr;
+  void *grad_bias = nullptr;
+
+  void *cpu_input = nullptr;
+  void *cpu_offset = nullptr;
+  void *cpu_mask = nullptr;
+  void *cpu_grad_output = nullptr;
+  void *cpu_grad_weight = nullptr;
+  void *cpu_grad_bias = nullptr;
+
+  void *workspace = nullptr;
+  size_t workspace_size = 0;
+  int64_t theory_ops = 0;
+};
+
+}  // namespace mluoptest
+#endif  // TEST_MLUOP_GTEST_SRC_ZOO_DCN_BACKWARD_WEIGHT_DCN_BACKWARD_WEIGHT_H_
diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_weight/test_case/case_hi_16.prototxt b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_weight/test_case/case_hi_16.prototxt
new file mode 100755
index 000000000..1f57195e6
--- /dev/null
+++ b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_weight/test_case/case_hi_16.prototxt
@@ -0,0 +1,112 @@
+op_name: "dcn_backward_weight"
+op_type: DCN_BACKWARD_WEIGHT
+input {
+  id: "input"
+  shape: {
+    dims: 1
+    dims: 16
+    dims: 16
+    dims: 300
+  }
+  layout: LAYOUT_NHWC
+  dtype: DTYPE_FLOAT
+  random_data: {
+    seed: 23
+    upper_bound: 2.4
+    lower_bound: -2.2
+    distribution: UNIFORM
+  }
+}
+input {
+  id: "offset"
+  shape: {
+    dims: 1
+    dims: 16
+    dims: 16
+    dims: 36
+  }
+  layout: LAYOUT_NHWC
+  dtype: DTYPE_FLOAT
+  random_data: {
+    seed: 23
+    upper_bound: 3.4
+    lower_bound: -2.9
+    distribution: UNIFORM
+  }
+}
+input {
+  id: "mask"
+  shape: {
+    dims: 1
+    dims: 16
+    dims: 16
+    dims: 18
+  }
+  layout: LAYOUT_NHWC
+  dtype: DTYPE_FLOAT
+  random_data: {
+    seed: 23
+    upper_bound: 1
+    lower_bound: 0
+    distribution: UNIFORM
+  }
+}
+
+input {
+  id: "grad_output"
+  shape: {
+    dims: 1
+    dims: 16
+    dims: 16
+    dims: 300
+  }
+  layout: LAYOUT_NHWC
+  dtype: DTYPE_FLOAT
+  random_data: {
+    seed: 23
+    upper_bound: 1
+    lower_bound: -1
+    distribution: UNIFORM
+  }
+}
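+# Shape notes: with kh = kw = 3, deformable_group = 2 and conv_group = 3
+# (see dcn_param below), the offset carries dg*kh*kw*2 = 36 channels, the
+# mask carries dg*kh*kw = 18 channels, and grad_weight is
+# [co, kh, kw, ci/g] = [300, 3, 3, 100].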
+output {
+  id: "grad_weight"
+  shape: {
+    dims: 300
+    dims: 3
+    dims: 3
+    dims: 100
+  }
+  layout: LAYOUT_NHWC
+  dtype: DTYPE_FLOAT
+}
+output {
+  id: "grad_bias"
+  shape: {
+    dims: 300
+  }
+  layout: LAYOUT_ARRAY
+  dtype: DTYPE_FLOAT
+}
+dcn_param: {
+  dimnb: 4
+  pad: 1
+  pad: 1
+  pad: 1
+  pad: 1
+  stride: 1
+  stride: 1
+  dilation: 1
+  dilation: 1
+  deformable_group: 2
+  conv_group: 3
+  im2col_step: 1
+  compute_type: 2
+}
+test_param: {
+  error_func: DIFF1
+  error_func: DIFF2
+  error_threshold: 0.003
+  error_threshold: 0.003
+  baseline_device: CPU
+}
diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_forward/dcn_forward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_forward/dcn_forward.cpp
new file mode 100644
index 000000000..9a210931d
--- /dev/null
+++ b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_forward/dcn_forward.cpp
@@ -0,0 +1,701 @@
+/*************************************************************************
+ * Copyright (C) [2019-2022] by Cambricon, Inc.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/
+#include "dcn_forward.h"
+#include "internal_kernel/transpose_cpu/transpose_cpu.h"
+
+#define USE_OPENBLAS 0
+
+#if USE_OPENBLAS
+#include <openblas/cblas.h>
+#endif
+
+namespace mluoptest {
+// input :[N,hi,wi,ci]
+// offset:[N,ho,wo,dg*kh*kw*2]
+// mask  :[N,ho,wo,dg*kh*kw]  // optional
+// weight:[co,kh,kw,ci/g]
+// bias  :[co]  // optional
+// output:[N,ho,wo,co]
+
+int DcnForwardExecutor::getCoefficientOfLT2CT() {
+  auto input_dtype =
+      cvtProtoDtypeToMluOp(parser_->getProtoNode()->input(0).dtype());
+  int lt_compute_force = 0;
+  int ct_compute_force = input_dtype == MLUOP_DTYPE_FLOAT ? 32 : 64;
+  if (input_dtype == MLUOP_DTYPE_FLOAT) {
+    lt_compute_force = 2 * 1.5 * 1024;
+  } else {
+    lt_compute_force = 2 * 0.375 * 1024;
+  }
+  return lt_compute_force / ct_compute_force;
+}
+
+void DcnForwardExecutor::paramCheck() {
+  if (parser_->getInputNum() != 3 && parser_->getInputNum() != 4 &&
+      parser_->getInputNum() != 5) {
+    LOG(ERROR) << "DCN_Forward tensor input number is wrong.";
+  }
+
+  auto dtype =
+      cvtProtoDtypeToMluOp(parser_->getProtoNode()->input(0).onchip_dtype());
+  input_onchip_dtype = dtype;
+
+  if (parser_->getInputNum() == 3 ||
+      parser_->getProtoNode()->input(3).shape().dims_size() == 1) {
+    dtype =
+        cvtProtoDtypeToMluOp(parser_->getProtoNode()->input(2).onchip_dtype());
+    weight_onchip_dtype = dtype;
+  } else {
+    dtype =
+        cvtProtoDtypeToMluOp(parser_->getProtoNode()->input(3).onchip_dtype());
+    weight_onchip_dtype = dtype;
+  }
+
+  if (!parser_->getProtoNode()->has_dcn_param()) {
+    LOG(ERROR) << "Missing dcn param. 
"; + } + + if (parser_->getOutputNum() != 1) { + LOG(ERROR) << "DCN_Forward tensor output number is wrong."; + } + TensorLayout input_order = parser_->getProtoNode()->input(0).layout(); + if (input_order != LAYOUT_NHWC) { + LOG(ERROR) << "DCN_Forward input tensor layout should be NHWC."; + } + + int N = parser_->getProtoNode()->input(0).shape().dims(0); + int ci = parser_->getProtoNode()->input(0).shape().dims(3); + int co = parser_->getProtoNode()->output(0).shape().dims(3); + + auto dcn_param = parser_->getProtoNode()->dcn_param(); + dimnb = dcn_param.dimnb(); + for (int i = 0; i < dcn_param.pad_size(); ++i) { + pad[i] = dcn_param.pad(i); + } + for (int i = 0; i < dcn_param.stride_size(); ++i) { + stride[i] = dcn_param.stride(i); + } + for (int i = 0; i < dcn_param.dilation_size(); ++i) { + dilation[i] = dcn_param.dilation(i); + } + if (dcn_param.has_deformable_group()) { + dg = dcn_param.deformable_group(); + } + if (dcn_param.has_conv_group()) { + g = dcn_param.conv_group(); + } + if (dcn_param.has_im2col_step()) { + im2col_step = dcn_param.im2col_step(); + } + + if (dimnb != 4) { + LOG(ERROR) << "[DCN_Forward]: dimnb should be 4."; + } + + if (ci % dg) { + LOG(ERROR) << "[DCN_Forward]: deformable_group is wrong."; + } + + if (ci % g) { + LOG(ERROR) << "[DCN_Forward]: conv_group is wrong."; + } + + if (co % g) { + LOG(ERROR) << "[DCN_Forward]: conv_group is wrong."; + } + + if (N % im2col_step) { + LOG(ERROR) << "[DCN_Forward]: im2col_step is wrong."; + } +} + +void DcnForwardExecutor::workspaceMalloc() { + input_desc = tensor_desc_[0].tensor; + offset_desc = tensor_desc_[1].tensor; + mluOpDataType_t compute_type; + auto dcn_param = parser_->getProtoNode()->dcn_param(); + if (dcn_param.has_compute_type()) { + compute_type = cvtProtoDtypeToMluOp(dcn_param.compute_type()); + } else { + compute_type = MLUOP_DTYPE_FLOAT; + } + mluOpDataType_t compute_type1 = compute_type; + mluOpDCNDescriptor_t dcn_desc = cpu_runtime_.allocate( + mluOpCreateDCNDescriptor, mluOpDestroyDCNDescriptor); + MLUOP_CHECK(mluOpSetDCNDescriptor(dcn_desc, dimnb, pad, stride, dilation, dg, + g, im2col_step, compute_type1)); + + if (parser_->getInputNum() == 3) { + mask_desc = nullptr; + weight_desc = tensor_desc_[2].tensor; + bias_desc = nullptr; + output_desc = tensor_desc_[3].tensor; + } else if (parser_->getInputNum() == 4) { + if (parser_->getProtoNode()->input(3).shape().dims_size() == 4) { + mask_desc = tensor_desc_[2].tensor; + weight_desc = tensor_desc_[3].tensor; + bias_desc = nullptr; + output_desc = tensor_desc_[4].tensor; + } else { + mask_desc = nullptr; + weight_desc = tensor_desc_[2].tensor; + bias_desc = tensor_desc_[3].tensor; + output_desc = tensor_desc_[4].tensor; + } + } else { + mask_desc = tensor_desc_[2].tensor; + weight_desc = tensor_desc_[3].tensor; + bias_desc = tensor_desc_[4].tensor; + output_desc = tensor_desc_[5].tensor; + } + + input_desc->onchip_dtype = input_onchip_dtype; + weight_desc->onchip_dtype = weight_onchip_dtype; + + MLUOP_CHECK(mluOpGetDCNForwardWorkspaceSize( + handle_, dcn_desc, input_desc, offset_desc, mask_desc, weight_desc, + bias_desc, output_desc, &workspace_size)); + + if (workspace_size != 0) { + workspace = mlu_runtime_.allocate(workspace_size); + } + + eva_->setMluWorkspaceSize(workspace_size); + cpu_runtime_.deallocate(dcn_desc); +} + +void DcnForwardExecutor::workspaceFree() { + if (workspace != nullptr) { + mlu_runtime_.deallocate(workspace); + } +} + +void DcnForwardExecutor::compute() { + input_desc = tensor_desc_[0].tensor; + offset_desc = 
tensor_desc_[1].tensor; + mluOpDataType_t compute_type; + auto dcn_param = parser_->getProtoNode()->dcn_param(); + if (dcn_param.has_compute_type()) { + compute_type = cvtProtoDtypeToMluOp(dcn_param.compute_type()); + } else { + compute_type = MLUOP_DTYPE_FLOAT; + } + mluOpDataType_t compute_type2 = compute_type; + mluOpDCNDescriptor_t dcn_desc = cpu_runtime_.allocate( + mluOpCreateDCNDescriptor, mluOpDestroyDCNDescriptor); + MLUOP_CHECK(mluOpSetDCNDescriptor(dcn_desc, dimnb, pad, stride, dilation, dg, + g, im2col_step, compute_type2)); + input = data_vector_[0].device_ptr; + offset = data_vector_[1].device_ptr; + if (parser_->getInputNum() == 3) { + mask_desc = nullptr; + mask = nullptr; + weight_desc = tensor_desc_[2].tensor; + weight = data_vector_[2].device_ptr; + bias_desc = nullptr; + bias = nullptr; + output_desc = tensor_desc_[3].tensor; + output = data_vector_[3].device_ptr; + } else if (parser_->getInputNum() == 4) { + if (parser_->getProtoNode()->input(3).shape().dims_size() == 4) { + mask_desc = tensor_desc_[2].tensor; + mask = data_vector_[2].device_ptr; + weight_desc = tensor_desc_[3].tensor; + weight = data_vector_[3].device_ptr; + bias_desc = nullptr; + bias = nullptr; + output_desc = tensor_desc_[4].tensor; + output = data_vector_[4].device_ptr; + } else { + mask_desc = nullptr; + mask = nullptr; + weight_desc = tensor_desc_[2].tensor; + weight = data_vector_[2].device_ptr; + bias_desc = tensor_desc_[3].tensor; + bias = data_vector_[3].device_ptr; + output_desc = tensor_desc_[4].tensor; + output = data_vector_[4].device_ptr; + } + } else { + mask_desc = tensor_desc_[2].tensor; + mask = data_vector_[2].device_ptr; + weight_desc = tensor_desc_[3].tensor; + weight = data_vector_[3].device_ptr; + bias_desc = tensor_desc_[4].tensor; + bias = data_vector_[4].device_ptr; + output_desc = tensor_desc_[5].tensor; + output = data_vector_[5].device_ptr; + } + + input_desc->onchip_dtype = input_onchip_dtype; + weight_desc->onchip_dtype = weight_onchip_dtype; + VLOG(4) << "call mluOpDCNForward()"; + interface_timer_.start(); + + MLUOP_CHECK(mluOpDCNForward(handle_, dcn_desc, input_desc, input, offset_desc, + offset, mask_desc, mask, weight_desc, weight, + bias_desc, bias, workspace, workspace_size, + output_desc, output)); + + interface_timer_.stop(); + cpu_runtime_.deallocate(dcn_desc); +} + +static float bilinear(float *input_ptr, const int &ci_offset, const int &hi, + const int &wi, const int &ci, const float &h_in, + const float &w_in) { + int h_low = floor(h_in); + int w_low = floor(w_in); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_in - h_low; + float lw = w_in - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + float v1 = 0, v2 = 0, v3 = 0, v4 = 0; + + if (h_low >= 0 && w_low >= 0) { + v1 = input_ptr[(h_low * wi + w_low) * ci + ci_offset]; + } + + if (h_low >= 0 && w_high <= wi - 1) { + v2 = input_ptr[(h_low * wi + w_high) * ci + ci_offset]; + } + + if (h_high <= hi - 1 && w_low >= 0) { + v3 = input_ptr[(h_high * wi + w_low) * ci + ci_offset]; + } + + if (h_high <= hi - 1 && w_high <= wi - 1) { + v4 = input_ptr[(h_high * wi + w_high) * ci + ci_offset]; + } + + float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + float val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + return val; +} + +static void im2col(const int &N, const int &im2col_step, const int &dg, + const int &hi, const int &wi, const int &ci, const int &ho, + const int &wo, const int &co, const int &kh, const int &kw, + const int &pt, const int &pb, const int &pl, const int &pr, + 
const int &sh, const int &sw, const int &dh, const int &dw,
+                   const float *cpu_input, const float *cpu_offset,
+                   const float *cpu_mask, float *buffer) {
+  for (int idx_n = 0; idx_n < im2col_step; ++idx_n) {
+    for (int idx_ho = 0; idx_ho < ho; ++idx_ho) {
+      for (int idx_wo = 0; idx_wo < wo; ++idx_wo) {
+        float *input_ptr = (float *)cpu_input + idx_n * hi * wi * ci;
+        float *offset_ptr =
+            (float *)cpu_offset +
+            ((idx_n * ho + idx_ho) * wo + idx_wo) * dg * kh * kw * 2;
+        float *mask_ptr =
+            cpu_mask != nullptr
+                ? (float *)cpu_mask +
+                      ((idx_n * ho + idx_ho) * wo + idx_wo) * dg * kh * kw
+                : nullptr;
+        float *columns_ptr =
+            (float *)buffer +
+            ((idx_n * ho + idx_ho) * wo + idx_wo) * kh * kw * ci;
+        const int hi_start = idx_ho * sh - pt;
+        const int wi_start = idx_wo * sw - pl;
+        for (int idx_kh = 0; idx_kh < kh; ++idx_kh) {
+          for (int idx_kw = 0; idx_kw < kw; ++idx_kw) {
+            for (int idx_dg = 0; idx_dg < dg; ++idx_dg) {
+              const int data_offset_h =
+                  ((idx_dg * kh + idx_kh) * kw + idx_kw) * 2;
+              const int data_offset_w =
+                  ((idx_dg * kh + idx_kh) * kw + idx_kw) * 2 + 1;
+              const int data_mask = (idx_dg * kh + idx_kh) * kw + idx_kw;
+              const float offset_h = offset_ptr[data_offset_h];
+              const float offset_w = offset_ptr[data_offset_w];
+              const float mask =
+                  mask_ptr != nullptr ? mask_ptr[data_mask] : 1.0f;
+              const float h_in = hi_start + idx_kh * dh + offset_h;
+              const float w_in = wi_start + idx_kw * dw + offset_w;
+              if (h_in > -1 && w_in > -1 && h_in < hi && w_in < wi) {
+                for (int idx_ci = 0; idx_ci < ci / dg; ++idx_ci) {
+                  const int ci_offset = idx_dg * ci / dg + idx_ci;
+                  const int columns_offset =
+                      (idx_kh * kw + idx_kw) * ci + ci_offset;
+                  columns_ptr[columns_offset] =
+                      bilinear(input_ptr, ci_offset, hi, wi, ci, h_in, w_in) *
+                      mask;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void DcnForwardExecutor::transpose(float *input, float *output,
+                                   const int dims[], const int dim_num,
+                                   const int permute[]) {
+  int64_t dim_desc = dim_num;
+  std::vector<int> permute_desc;
+  if (dim_desc > 8 || dim_desc <= 0) {
+    LOG(ERROR) << "dim_desc is " << dim_desc
+               << ", it should be greater than 0 and no more than 8";
+  }
+  { std::vector<int>().swap(permute_desc); }
+  for (int i = 0; i < dim_num; i++) {
+    permute_desc.push_back(permute[i]);
+  }
+  mluOpTensorDescriptor_t input_desc, output_desc;
+  input_desc = cpu_runtime_.allocate(mluOpCreateTensorDescriptor,
+                                     mluOpDestroyTensorDescriptor);
+  output_desc = cpu_runtime_.allocate(mluOpCreateTensorDescriptor,
+                                      mluOpDestroyTensorDescriptor);
+  // sized to TRANSPOSE_MAX_DIM so every dim_num accepted above fits
+  int dims_trans[TRANSPOSE_MAX_DIM];
+  for (int i = 0; i < dim_num; ++i) {
+    dims_trans[i] = dims[permute[i]];
+  }
+
+  MLUOP_CHECK(mluOpSetTensorDescriptor(input_desc, MLUOP_LAYOUT_ARRAY,
+                                       MLUOP_DTYPE_FLOAT, dim_num, dims));
+  MLUOP_CHECK(mluOpSetTensorDescriptor(output_desc, MLUOP_LAYOUT_ARRAY,
+                                       MLUOP_DTYPE_FLOAT, dim_num, dims_trans));
+
+  MLUOP_CHECK(mluOpTransposeCpu(dim_desc, permute_desc, input_desc, input,
+                                output_desc, output));
+  cpu_runtime_.deallocate(input_desc);
+  cpu_runtime_.deallocate(output_desc);
+}
+
+static void BatchMatMul(const int &g, const int &m, const int &k, const int &n,
+                        float *input_a, float *input_b, float *output,
+                        const bool is_transa, const bool is_transb) {
+  const int batch_size = g;
+
+  assert(batch_size >= 1);
+#if USE_OPENBLAS
+  const CBLAS_ORDER Order = CblasRowMajor;
+  const CBLAS_TRANSPOSE TransA = is_transa ? CblasTrans : CblasNoTrans;
+  const CBLAS_TRANSPOSE TransB = is_transb ? CblasTrans : CblasNoTrans;
+
+  int lda = is_transa ? m : k;
+  int ldb = is_transb ?
k : n; + int ldc = n; + + float alpha = 1.0f; + float beta = 1.0f; +#else + auto matmul = [](float *lhs, float *rhs, float *output, bool is_trans_a, + bool is_trans_b, int M, int N, int K) { + for (int m = 0; m < M; m++) { + for (int n = 0; n < N; n++) { + // output[m * N + n] = 0.0f; + for (int k = 0; k < K; k++) { + int lhs_idx = m * K + k; + if (is_trans_a) lhs_idx = k * M + m; + int rhs_idx = k * N + n; + if (is_trans_b) rhs_idx = n * K + k; + output[m * N + n] += lhs[lhs_idx] * rhs[rhs_idx]; + } + } + } + }; +#endif + for (int i = 0; i < batch_size; ++i) { +#if USE_OPENBLAS + cblas_sgemm(Order, TransA, TransB, m, n, k, alpha, input_a + i * m * k, lda, + input_b + i * k * n, ldb, beta, output + i * m * n, ldc); +#else + matmul(input_a + i * m * k, input_b + i * k * n, output + i * m * n, + is_transa, is_transb, m, n, k); +#endif + } +} + +static void dealBias(float *cpu_output, float *cpu_bias, const int &N, + const int &ho, const int &wo, const int &co) { + for (int idx_n = 0; idx_n < N; ++idx_n) { + for (int idx_ho = 0; idx_ho < ho; ++idx_ho) { + for (int idx_wo = 0; idx_wo < wo; ++idx_wo) { + for (int idx_co = 0; idx_co < co; ++idx_co) { + cpu_output[((idx_n * ho + idx_ho) * wo + idx_wo) * co + idx_co] += + cpu_bias[idx_co]; + } + } + } + } +} + +void DcnForwardExecutor::computeDCNForwardCPU( + const int &dg, const int &g, const int &im2col_step, + const mluOpTensorDescriptor_t input_desc, const void *cpu_input, + const mluOpTensorDescriptor_t offset_desc, const void *cpu_offset, + const mluOpTensorDescriptor_t mask_desc, const void *cpu_mask, + const mluOpTensorDescriptor_t weight_desc, const void *cpu_weight, + const mluOpTensorDescriptor_t bias_desc, const void *cpu_bias, + const mluOpTensorDescriptor_t output_desc, const void *cpu_output, + float *buffer, int pad[], int stride[], int dilation[], + int64_t &theory_ops) { + const int N = input_desc->dims[0]; + const int hi = input_desc->dims[1]; + const int wi = input_desc->dims[2]; + const int ci = input_desc->dims[3]; + const int ho = offset_desc->dims[1]; + const int wo = offset_desc->dims[2]; + const int co = output_desc->dims[3]; + const int kh = weight_desc->dims[1]; + const int kw = weight_desc->dims[2]; + const int pt = pad[0]; + const int pb = pad[1]; + const int pl = pad[2]; + const int pr = pad[3]; + const int sh = stride[0]; + const int sw = stride[1]; + const int dh = dilation[0]; + const int dw = dilation[1]; + int coeff = getCoefficientOfLT2CT(); + if (g == 1) { + for (int i = 0; i < N / im2col_step; ++i) { + float *input_i = (float *)cpu_input + i * im2col_step * hi * wi * ci; + float *offset_i = + (float *)cpu_offset + i * im2col_step * ho * wo * dg * kh * kw * 2; + float *mask_i = + cpu_mask != nullptr + ? 
(float *)cpu_mask + i * im2col_step * ho * wo * dg * kh * kw + : nullptr; + float *output_i = (float *)cpu_output + i * im2col_step * ho * wo * co; + // 1.im2col + memset(buffer, 0, (im2col_step * ho * wo * kh * kw * ci) * sizeof(float)); + im2col(N, im2col_step, dg, hi, wi, ci, ho, wo, co, kh, kw, pt, pb, pl, pr, + sh, sw, dh, dw, input_i, offset_i, mask_i, buffer); + theory_ops += (int64_t)im2col_step * ho * wo * kh * kw * ci * + 15; // bilinear_count + mask + + // 2.BMM + float *input_a = buffer; + float *input_b = (float *)cpu_weight; + const int k = kh * kw * ci / g; + const int m = im2col_step * ho * wo; + const int n = co / g; + memset(output_i, 0, (im2col_step * ho * wo * co) * sizeof(float)); + BatchMatMul(g, m, k, n, input_a, input_b, (float *)output_i, false, true); + theory_ops += 2 * (int64_t)g * m * k * n / coeff; + } + } else { + // buffer:| columns_a | columns_b | output | + float *buffer_columns_a = buffer; + float *buffer_columns_b = + buffer_columns_a + im2col_step * ho * wo * kh * kw * ci; + float *buffer_output = + buffer_columns_b + im2col_step * ho * wo * kh * kw * ci; + + for (int i = 0; i < N / im2col_step; ++i) { + float *input_i = (float *)cpu_input + i * im2col_step * hi * wi * ci; + float *offset_i = + (float *)cpu_offset + i * im2col_step * ho * wo * dg * kh * kw * 2; + float *mask_i = + cpu_mask != nullptr + ? (float *)cpu_mask + i * im2col_step * ho * wo * dg * kh * kw + : nullptr; + float *output_i = (float *)cpu_output + i * im2col_step * ho * wo * co; + // 1.im2col + memset(buffer_columns_a, 0, + (im2col_step * ho * wo * kh * kw * ci) * sizeof(float)); + im2col(N, im2col_step, dg, hi, wi, ci, ho, wo, co, kh, kw, pt, pb, pl, pr, + sh, sw, dh, dw, input_i, offset_i, mask_i, buffer_columns_a); + theory_ops += (int64_t)im2col_step * ho * wo * kh * kw * ci * + 15; // bilinear_count + mask + + // 2.split columns + // [im2col_step*ho*wo*kh*kw,ci]->[g,im2col_step*ho*wo*kh*kw,ci/g] + int dims_1[3] = {im2col_step * ho * wo * kh * kw, g, ci / g}; + int permute_1[3] = {1, 0, 2}; + transpose(buffer_columns_a, buffer_columns_b, dims_1, 3, permute_1); + theory_ops += (int64_t)im2col_step * ho * wo * kh * kw * ci; + + // 3.BMM + float *input_a = buffer_columns_b; + float *input_b = (float *)cpu_weight; + const int k = kh * kw * ci / g; + const int m = im2col_step * ho * wo; + const int n = co / g; + memset(buffer_output, 0, (im2col_step * ho * wo * co) * sizeof(float)); + BatchMatMul(g, m, k, n, input_a, input_b, buffer_output, false, true); + theory_ops += 2 * (int64_t)g * m * k * n / coeff; + + // 4.transpose output [g,im2col_step*ho*wo, co/g]->[im2col_step*ho*wo, g, + // co/g] + int dims_2[3] = {g, im2col_step * ho * wo, co / g}; + int permute_2[3] = {1, 0, 2}; + transpose(buffer_output, (float *)output_i, dims_2, 3, permute_2); + theory_ops += (int64_t)im2col_step * ho * wo * co; + } + } + + if (cpu_bias) { + dealBias((float *)cpu_output, (float *)cpu_bias, N, ho, wo, co); + theory_ops += (int64_t)N * ho * wo * co; + } +} + +void DcnForwardExecutor::cpuCompute() { + input_desc = tensor_desc_[0].tensor; + offset_desc = tensor_desc_[1].tensor; + cpu_input = cpu_fp32_input_[0]; + cpu_offset = cpu_fp32_input_[1]; + if (parser_->getInputNum() == 3) { + mask_desc = nullptr; + cpu_mask = nullptr; + weight_desc = tensor_desc_[2].tensor; + cpu_weight = cpu_fp32_input_[2]; + bias_desc = nullptr; + cpu_bias = nullptr; + output_desc = tensor_desc_[3].tensor; + cpu_output = cpu_fp32_output_[0]; + } else if (parser_->getInputNum() == 4) { + if 
(parser_->getProtoNode()->input(3).shape().dims_size() == 4) {
+      mask_desc = tensor_desc_[2].tensor;
+      cpu_mask = cpu_fp32_input_[2];
+      weight_desc = tensor_desc_[3].tensor;
+      cpu_weight = cpu_fp32_input_[3];
+      bias_desc = nullptr;
+      cpu_bias = nullptr;
+      output_desc = tensor_desc_[4].tensor;
+      cpu_output = cpu_fp32_output_[0];
+    } else {
+      mask_desc = nullptr;
+      cpu_mask = nullptr;
+      weight_desc = tensor_desc_[2].tensor;
+      cpu_weight = cpu_fp32_input_[2];
+      bias_desc = tensor_desc_[3].tensor;
+      cpu_bias = cpu_fp32_input_[3];
+      output_desc = tensor_desc_[4].tensor;
+      cpu_output = cpu_fp32_output_[0];
+    }
+  } else {
+    mask_desc = tensor_desc_[2].tensor;
+    cpu_mask = cpu_fp32_input_[2];
+    weight_desc = tensor_desc_[3].tensor;
+    cpu_weight = cpu_fp32_input_[3];
+    bias_desc = tensor_desc_[4].tensor;
+    cpu_bias = cpu_fp32_input_[4];
+    output_desc = tensor_desc_[5].tensor;
+    cpu_output = cpu_fp32_output_[0];
+  }
+
+  const int ho = offset_desc->dims[1];
+  const int wo = offset_desc->dims[2];
+  const int kh = weight_desc->dims[1];
+  const int kw = weight_desc->dims[2];
+  const int ci = input_desc->dims[3];
+  const int co = output_desc->dims[3];
+
+  size_t cpu_buffer_size = 0;
+  if (g == 1) {
+    cpu_buffer_size =
+        (static_cast<size_t>(im2col_step) * ho * wo * kh * kw * ci) *
+        sizeof(float);
+  } else {
+    cpu_buffer_size = (2lu * im2col_step * ho * wo * kh * kw * ci +
+                       im2col_step * ho * wo * co) *
+                      sizeof(float);
+  }
+
+  float *buffer = nullptr;
+  buffer = (float *)cpu_runtime_.allocate(cpu_buffer_size);
+  if (buffer == nullptr) {
+    LOG(ERROR) << "dcn_forward: allocate buffer failed.";
+  }
+  theory_ops = 0;
+  computeDCNForwardCPU(dg, g, im2col_step, input_desc, cpu_input, offset_desc,
+                       cpu_offset, mask_desc, cpu_mask, weight_desc, cpu_weight,
+                       bias_desc, cpu_bias, output_desc, cpu_output, buffer,
+                       pad, stride, dilation, theory_ops);
+
+  cpu_runtime_.deallocate(buffer);
+}
+
+int64_t DcnForwardExecutor::getTheoryOps() {
+  if (exe_config_->mlu_only) {
+    theory_ops = 0;
+
+    input_desc = tensor_desc_[0].tensor;
+    offset_desc = tensor_desc_[1].tensor;
+    if (parser_->getInputNum() == 3) {
+      weight_desc = tensor_desc_[2].tensor;
+      bias_desc = nullptr;
+      output_desc = tensor_desc_[3].tensor;
+    } else if (parser_->getInputNum() == 4) {
+      if (parser_->getProtoNode()->input(3).shape().dims_size() == 4) {
+        weight_desc = tensor_desc_[3].tensor;
+        bias_desc = nullptr;
+        output_desc = tensor_desc_[4].tensor;
+      } else {
+        weight_desc = tensor_desc_[2].tensor;
+        bias_desc = tensor_desc_[3].tensor;
+        output_desc = tensor_desc_[4].tensor;
+      }
+    } else {
+      weight_desc = tensor_desc_[3].tensor;
+      bias_desc = tensor_desc_[4].tensor;
+      output_desc = tensor_desc_[5].tensor;
+    }
+
+    const int N = input_desc->dims[0];
+    const int hi = input_desc->dims[1];
+    const int wi = input_desc->dims[2];
+    const int ci = input_desc->dims[3];
+    const int ho = offset_desc->dims[1];
+    const int wo = offset_desc->dims[2];
+    const int co = output_desc->dims[3];
+    const int kh = weight_desc->dims[1];
+    const int kw = weight_desc->dims[2];
+    int coeff = getCoefficientOfLT2CT();
+    const int k = kh * kw * ci / g;
+    const int m = im2col_step * ho * wo;
+    const int n = co / g;
+    if (g == 1) {
+      for (int i = 0; i < N / im2col_step; ++i) {
+        // 1.im2col
+        // bilinear_count + mask
+        theory_ops +=
+            (int64_t)im2col_step * ho * wo * kh * kw * (dg * 7 + ci * 7);
+        // 2.BMM
+        theory_ops += 2 * (int64_t)g * m * k * n / coeff;
+      }
+    } else {
+      for (int i = 0; i < N / im2col_step; ++i) {
+        // 1.im2col
+        // bilinear_count + mask
+        theory_ops +=
+            (int64_t)im2col_step * ho * wo * kh * kw * (dg * 7 + ci * 7);
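+        // Note: steps 2-4 below mirror the grouped path in
+        // computeDCNForwardCPU: reordering the im2col columns into
+        // [g, im2col_step*ho*wo*kh*kw, ci/g], the grouped batch matmul
+        // against the [g, co/g, kh*kw*ci/g] weights, and transposing the
+        // grouped output back to channel-last order each contribute to the
+        // theoretical op count.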
+        // 2.split columns
+        // [im2col_step*ho*wo*kh*kw,ci]->[g,im2col_step*ho*wo*kh*kw,ci/g]
+        theory_ops += (int64_t)im2col_step * ho * wo * kh * kw * ci;
+        // 3.BMM
+        theory_ops += 2 * (int64_t)g * m * k * n / coeff;
+        // 4.transpose output [g,im2col_step*ho*wo, co/g]->[im2col_step*ho*wo,
+        // g, co/g]
+        theory_ops += (int64_t)im2col_step * ho * wo * co;
+      }
+    }
+
+    if (bias_desc) {
+      theory_ops += (int64_t)N * ho * wo * co;
+    }
+  }
+  VLOG(4) << "getTheoryOps: " << theory_ops << " ops";
+  return theory_ops;
+}
+
+}  // namespace mluoptest
diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_forward/dcn_forward.h b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_forward/dcn_forward.h
new file mode 100755
index 000000000..ad0dda9e0
--- /dev/null
+++ b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_forward/dcn_forward.h
@@ -0,0 +1,84 @@
+/*************************************************************************
+ * Copyright (C) [2019-2022] by Cambricon, Inc.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/
+#ifndef TEST_MLUOP_GTEST_SRC_ZOO_DCN_FORWARD_DCN_FORWARD_H_
+#define TEST_MLUOP_GTEST_SRC_ZOO_DCN_FORWARD_DCN_FORWARD_H_
+
+#include <vector>
+#include "executor.h"
+
+namespace mluoptest {
+
+class DcnForwardExecutor : public Executor {
+ public:
+  DcnForwardExecutor() {}
+  ~DcnForwardExecutor() {}
+
+  void workspaceMalloc();
+  void workspaceFree();
+  void paramCheck();
+  void compute();
+  void cpuCompute();
+  int64_t getTheoryOps() override;
+
+ private:
+  int getCoefficientOfLT2CT();
+  void transpose(float *input, float *output, const int dims[],
+                 const int dim_num, const int permute[]);
+  void computeDCNForwardCPU(
+      const int &dg, const int &g, const int &im2col_step,
+      const mluOpTensorDescriptor_t input_desc, const void *cpu_input,
+      const mluOpTensorDescriptor_t offset_desc, const void *cpu_offset,
+      const mluOpTensorDescriptor_t mask_desc, const void *cpu_mask,
+      const mluOpTensorDescriptor_t weight_desc, const void *cpu_weight,
+      const mluOpTensorDescriptor_t bias_desc, const void *cpu_bias,
+      const mluOpTensorDescriptor_t output_desc, const void *cpu_output,
+      float *buffer, int pad[], int stride[], int dilation[],
+      int64_t &theory_ops);
+  mluOpDataType_t input_onchip_dtype;
+  mluOpDataType_t weight_onchip_dtype;
+
+  mluOpTensorDescriptor_t input_desc;
+  mluOpTensorDescriptor_t offset_desc;
+  mluOpTensorDescriptor_t mask_desc = nullptr;  // optional
+  mluOpTensorDescriptor_t output_desc;
+  mluOpTensorDescriptor_t weight_desc;
+  mluOpTensorDescriptor_t bias_desc = nullptr;  // optional
+
+  int dimnb;
+  int pad[4];
+  int stride[2];
+  int dilation[2];
+  int dg;
+  int g;
+  int im2col_step;
+
+  void *input = nullptr;
+  void *offset = nullptr;
+  void *mask = nullptr;
+  void *output = nullptr;
+  void *weight = nullptr;
+  void *bias = nullptr;
+
+  void *cpu_input = nullptr;
+  void *cpu_offset = nullptr;
+  void *cpu_mask = nullptr;
+  void *cpu_output = nullptr;
+  void *cpu_weight = nullptr;
+  void *cpu_bias = nullptr;
+
+  void *workspace =
nullptr; + size_t workspace_size = 0; + int64_t theory_ops = 0; +}; + +} // namespace mluoptest +#endif // TEST_MLUOP_GTEST_SRC_ZOO_DCN_FORWARD_DCN_FORWARD_H_ diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_forward/test_case/case_hi_16.prototxt b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_forward/test_case/case_hi_16.prototxt new file mode 100755 index 000000000..12765af0f --- /dev/null +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_forward/test_case/case_hi_16.prototxt @@ -0,0 +1,117 @@ +op_name: "dcn_forward" +op_type: DCN_FORWARD +input { + id: "input" + shape: { + dims: 1 + dims: 16 + dims: 16 + dims: 300 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT + random_data: { + seed: 23 + upper_bound: 1 + lower_bound: -1 + distribution: UNIFORM + } +} +input { + id: "offset" + shape: { + dims: 1 + dims: 16 + dims: 16 + dims: 36 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT + random_data: { + seed: 23 + upper_bound: 1 + lower_bound: -1 + distribution: UNIFORM + } +} +input { + id: "mask" + shape: { + dims: 1 + dims: 16 + dims: 16 + dims: 18 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT + random_data: { + seed: 23 + upper_bound: 1 + lower_bound: 0 + distribution: UNIFORM + } +} +input { + id: "weight" + shape: { + dims: 300 + dims: 3 + dims: 3 + dims: 100 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT + random_data: { + seed: 23 + upper_bound: 10 + lower_bound: -1 + distribution: UNIFORM + } +} +input { + id: "bias" + shape: { + dims: 300 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 23 + upper_bound: 5 + lower_bound: -6 + distribution: UNIFORM + } +} +output { + id: "output" + shape: { + dims: 1 + dims: 16 + dims: 16 + dims: 300 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT +} +dcn_param: { + dimnb: 4 + pad: 1 + pad: 1 + pad: 1 + pad: 1 + stride: 1 + stride: 1 + dilation: 1 + dilation: 1 + deformable_group: 2 + conv_group: 3 + im2col_step: 1 + compute_type: 2 +} +test_param: { + error_func: DIFF1 + error_func: DIFF2 + error_threshold: 0.003 + error_threshold: 0.003 + baseline_device: CPU +}