diff --git a/docs/MLU-OPS-OpList.md b/docs/MLU-OPS-OpList.md
old mode 100644
new mode 100755
index ca0807bf3..bed2a2bf7
--- a/docs/MLU-OPS-OpList.md
+++ b/docs/MLU-OPS-OpList.md
@@ -101,4 +101,6 @@ MLU Binary Op算子结构:
 | voxel_pooling_forward | √ | |
 | voxelization | √ | |
 | yolo_box | √ | |
-| dcn_backward_data | | √ |
\ No newline at end of file
+| dcn_backward_data | | √ |
+| dcn_forward | | √ |
+| dcn_backward_weight | | √ |
\ No newline at end of file
diff --git a/docs/user_guide/9_operators/index.rst b/docs/user_guide/9_operators/index.rst
old mode 100644
new mode 100755
index 7f5540821..ec6e56dcc
--- a/docs/user_guide/9_operators/index.rst
+++ b/docs/user_guide/9_operators/index.rst
@@ -90,6 +90,24 @@ mluOpCopy
 -----------------------------
 该算子主要在语音网络中使用，对数据块进行 device 到 device 的拷贝。
 
+.. _dcn_backward_data:
+
+mluOpDCNBackwardData
+---------------------------------
+Computes the gradients of the deformable convolution operator with respect to input, offset, and mask.
+
+.. _dcn_backward_weight:
+
+mluOpDCNBackwardWeight
+-----------------------------
+Computes the gradients of the deformable convolution operator with respect to filter and bias.
+
+.. _dcn_forward:
+
+mluOpDCNForward
+-----------------------------
+Deformable convolution. It strengthens the filter's ability to model spatial geometric deformations through additional offset and mask inputs, and it can replace any regular convolution layer in an existing convolutional network.
+
 .. _deform_roi_pool_backward:
 
 mluOpDeformRoiPoolBackward
@@ -1008,9 +1026,3 @@ mluOpConcat
 - ``N`` 为每个input和output的维度数。
 - ``sum(axis_1, ..., axis_m)`` 表示对待拼接维度求和，output的拼接维度大小为所有input拼接维度的总和。
 - 除拼接维度外，其余维度的大小需要相等。
-
-.. _dcn_backward_data:
-
-mluOpDCNBackwardData
----------------------------------
-该算子用于求取可变形卷积算子关于input、offset、mask的反向梯度。
diff --git a/kernel_depends.toml b/kernel_depends.toml
old mode 100644
new mode 100755
index f6d76f6b1..7dc5a0441
--- a/kernel_depends.toml
+++ b/kernel_depends.toml
@@ -41,3 +41,5 @@ deform_roi_pool_forward = ["deform_roi_pool"]
 deform_roi_pool_backward = ["deform_roi_pool"]
 carafe_forward = ["carafe"]
 carafe_backward = ["carafe"]
+dcn_backward_weight = ["dcn_forward"]
+dcn_backward_data = ["dcn_forward"]
diff --git a/kernels/dcn_backward_data/dcn_backward_data.cpp b/kernels/dcn_backward_data/dcn_backward_data.cpp
old mode 100644
new mode 100755
index 88ea92d65..aa20bb224
--- a/kernels/dcn_backward_data/dcn_backward_data.cpp
+++ b/kernels/dcn_backward_data/dcn_backward_data.cpp
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (C) [2022] by Cambricon, Inc.
+ * Copyright (C) [2024] by Cambricon, Inc.
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the @@ -26,43 +26,7 @@ #include "kernels/utils/cnnl_helper.h" -#define DCNBPDATA_API "mluOpDcnBackwardData" - -mluOpStatus_t MLUOP_WIN_API -mluOpCreateDCNDescriptor(mluOpDCNDescriptor_t *dcn_desc) { - PARAM_CHECK(DCNBPDATA_API, dcn_desc != NULL); - CHECK_FUNC_RETURN(cnnlCreateDCNDescriptor(dcn_desc), CNNL_STATUS_SUCCESS, - "[mluOpCreateDCNDescriptor] Internal error accured in " - "cnnlCreateDCNDescriptor.", - MLUOP_STATUS_INTERNAL_ERROR); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API -mluOpDestroyDCNDescriptor(mluOpDCNDescriptor_t dcn_desc) { - PARAM_CHECK(DCNBPDATA_API, dcn_desc != NULL); - CHECK_FUNC_RETURN(cnnlDestroyDCNDescriptor(dcn_desc), CNNL_STATUS_SUCCESS, - "[mluOpDestroyDCNDescriptor] Internal error accured in " - "cnnlDestroyDCNDescriptor.", - MLUOP_STATUS_INTERNAL_ERROR); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpSetDCNDescriptor( - mluOpDCNDescriptor_t dcn_desc, int dimNb, const int pad[], - const int stride[], const int dilation[], int deformable_group, - int conv_group, int im2col_step, const mluOpDataType_t compute_type) { - PARAM_CHECK(DCNBPDATA_API, dcn_desc != NULL); - CHECK_FUNC_RETURN( - cnnlSetDCNDescriptor(dcn_desc, dimNb, pad, stride, dilation, - deformable_group, conv_group, im2col_step, - cnnlDataType_t(compute_type)), - CNNL_STATUS_SUCCESS, - "[mluOpSetDCNDescriptor] Internal error accured in " - "cnnlSetDCNDescriptor.", - MLUOP_STATUS_INTERNAL_ERROR); - return MLUOP_STATUS_SUCCESS; -} +#define DCNBPDATA_API "mluOpDCNBackwardData" mluOpStatus_t MLUOP_WIN_API mluOpGetDCNBakcwardDataWorkspaceSize( mluOpHandle_t handle, const mluOpDCNDescriptor_t dcn_desc, diff --git a/kernels/dcn_backward_weight/dcn_backward_weight.cpp b/kernels/dcn_backward_weight/dcn_backward_weight.cpp new file mode 100644 index 000000000..0f9bcb094 --- /dev/null +++ b/kernels/dcn_backward_weight/dcn_backward_weight.cpp @@ -0,0 +1,109 @@ +/************************************************************************* + * Copyright (C) [2024] by Cambricon, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/
+#include
+#include
+#include
+
+#include "kernels/utils/cnnl_helper.h"
+
+#define DCNBACKWARDWEIGHT_API "mluOpDCNBackwardWeight"
+
+mluOpStatus_t MLUOP_WIN_API mluOpGetDCNBackwardWeightWorkspaceSize(
+    mluOpHandle_t handle, const mluOpDCNDescriptor_t dcn_desc,
+    const mluOpTensorDescriptor_t input_desc,
+    const mluOpTensorDescriptor_t offset_desc,
+    const mluOpTensorDescriptor_t mask_desc,
+    const mluOpTensorDescriptor_t grad_output_desc,
+    const mluOpTensorDescriptor_t grad_filter_desc,
+    const mluOpTensorDescriptor_t grad_bias_desc, size_t *size) {
+  PARAM_CHECK("mluOpDCNBackwardWeight", handle != NULL);
+  PARAM_CHECK("mluOpDCNBackwardWeight", dcn_desc != NULL);
+  DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, _handle);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, _input_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(offset_desc, _offset_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mask_desc, _mask_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_output_desc,
+                                               _grad_output_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_filter_desc,
+                                               _grad_filter_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_bias_desc, _grad_bias_desc);
+  CHECK_FUNC_RETURN(
+      cnnlGetDCNBackwardWeightWorkspaceSize(
+          _handle, dcn_desc, _input_desc, _offset_desc, _mask_desc,
+          _grad_output_desc, _grad_filter_desc, _grad_bias_desc, size),
+      CNNL_STATUS_SUCCESS,
+      "[mluOpGetDCNBackwardWeightWorkspaceSize] Internal error occurred in "
+      "cnnlGetDCNBackwardWeightWorkspaceSize.",  // NOLINT
+      MLUOP_STATUS_INTERNAL_ERROR);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(_input_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(_offset_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(_mask_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(_grad_output_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(_grad_filter_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(_grad_bias_desc);
+  DESTROY_CNNL_HANDLE(_handle);
+  return MLUOP_STATUS_SUCCESS;
+}
+
+mluOpStatus_t MLUOP_WIN_API mluOpDCNBackwardWeight(
+    mluOpHandle_t handle, const mluOpDCNDescriptor_t dcn_desc,
+    const mluOpTensorDescriptor_t input_desc, const void *input,
+    const mluOpTensorDescriptor_t offset_desc, const void *offset,
+    const mluOpTensorDescriptor_t mask_desc, const void *mask,
+    const mluOpTensorDescriptor_t grad_output_desc, const void *grad_output,
+    void *workspace, const size_t workspace_size,
+    const mluOpTensorDescriptor_t grad_filter_desc, void *grad_filter,
+    const mluOpTensorDescriptor_t grad_bias_desc, void *grad_bias) {
+  PARAM_CHECK(DCNBACKWARDWEIGHT_API, handle != NULL);
+  if (workspace_size > 0) {
+    PARAM_CHECK(DCNBACKWARDWEIGHT_API, workspace != NULL);
+  }
+  DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_input_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(offset_desc, cnnl_offset_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mask_desc, cnnl_mask_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_output_desc,
+                                               cnnl_grad_output_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_filter_desc,
+                                               cnnl_grad_filter_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_bias_desc,
+                                               cnnl_grad_bias_desc);
+  CHECK_FUNC_RETURN(
+      cnnlDCNBackwardWeight(cnnl_handle, dcn_desc, cnnl_input_desc, input,
+                            cnnl_offset_desc, offset, cnnl_mask_desc, mask,
+                            cnnl_grad_output_desc, grad_output, workspace,
+                            workspace_size, cnnl_grad_filter_desc, grad_filter,
+                            cnnl_grad_bias_desc, grad_bias),
+      CNNL_STATUS_SUCCESS,
+      "[mluOpDCNBackwardWeight] Internal error occurred in "
+      "cnnlDCNBackwardWeight.",
+      MLUOP_STATUS_INTERNAL_ERROR);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_offset_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mask_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_output_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_filter_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_bias_desc);
+  DESTROY_CNNL_HANDLE(cnnl_handle);
+  return MLUOP_STATUS_SUCCESS;
+}
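For review convenience, a minimal sketch of the call sequence these two new entry points are meant to support, assuming a valid handle, an already-configured DCN descriptor, and already-set tensor descriptors (variable names are illustrative, cnrtMalloc from the CNRT runtime is assumed for the workspace, and status checks are omitted):

    // Query the workspace needed by the backward-weight kernel, then run it.
    size_t ws_size = 0;
    mluOpGetDCNBackwardWeightWorkspaceSize(handle, dcn_desc, input_desc,
                                           offset_desc, mask_desc,
                                           grad_output_desc, grad_filter_desc,
                                           grad_bias_desc, &ws_size);
    void *workspace = nullptr;
    if (ws_size > 0) cnrtMalloc(&workspace, ws_size);  // assumed CNRT helper
    mluOpDCNBackwardWeight(handle, dcn_desc, input_desc, input, offset_desc,
                           offset, mask_desc, mask, grad_output_desc,
                           grad_output, workspace, ws_size, grad_filter_desc,
                           grad_filter, grad_bias_desc, grad_bias);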
diff --git a/kernels/dcn_forward/dcn_common.h b/kernels/dcn_forward/dcn_common.h
new file mode 100644
index 000000000..59acab57a
--- /dev/null
+++ b/kernels/dcn_forward/dcn_common.h
@@ -0,0 +1,69 @@
+/*************************************************************************
+ * Copyright (C) [2022] by Cambricon, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/
+#ifndef KERNELS_DCN_COMMON_DCN_COMMON_H
+#define KERNELS_DCN_COMMON_DCN_COMMON_H
+#include
+#include
+#include
+
+#include "kernels/utils/cnnl_helper.h"
+
+#define DCN_API "mluOpDCN"
+
+mluOpStatus_t MLUOP_WIN_API
+mluOpCreateDCNDescriptor(mluOpDCNDescriptor_t *dcn_desc) {
+  PARAM_CHECK(DCN_API, dcn_desc != NULL);
+  CHECK_FUNC_RETURN(cnnlCreateDCNDescriptor(dcn_desc), CNNL_STATUS_SUCCESS,
+                    "[mluOpCreateDCNDescriptor] Internal error occurred in "
+                    "cnnlCreateDCNDescriptor.",
+                    MLUOP_STATUS_INTERNAL_ERROR);
+  return MLUOP_STATUS_SUCCESS;
+}
+
+mluOpStatus_t MLUOP_WIN_API
+mluOpDestroyDCNDescriptor(mluOpDCNDescriptor_t dcn_desc) {
+  PARAM_CHECK(DCN_API, dcn_desc != NULL);
+  CHECK_FUNC_RETURN(cnnlDestroyDCNDescriptor(dcn_desc), CNNL_STATUS_SUCCESS,
+                    "[mluOpDestroyDCNDescriptor] Internal error occurred in "
+                    "cnnlDestroyDCNDescriptor.",
+                    MLUOP_STATUS_INTERNAL_ERROR);
+  return MLUOP_STATUS_SUCCESS;
+}
+
+mluOpStatus_t MLUOP_WIN_API mluOpSetDCNDescriptor(
+    mluOpDCNDescriptor_t dcn_desc, int dimNb, const int pad[],
+    const int stride[], const int dilation[], int deformable_group,
+    int conv_group, int im2col_step, const mluOpDataType_t compute_type) {
+  PARAM_CHECK(DCN_API, dcn_desc != NULL);
+  CHECK_FUNC_RETURN(
+      cnnlSetDCNDescriptor(dcn_desc, dimNb, pad, stride, dilation,
+                           deformable_group, conv_group, im2col_step,
+                           cnnlDataType_t(compute_type)),
+      CNNL_STATUS_SUCCESS,
+      "[mluOpSetDCNDescriptor] Internal error occurred in "
+      "cnnlSetDCNDescriptor.",
+      MLUOP_STATUS_INTERNAL_ERROR);
+  return MLUOP_STATUS_SUCCESS;
+}
+
+#endif  // KERNELS_DCN_COMMON_DCN_COMMON_H
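The helpers above are thin wrappers over the cnnl descriptor API; a minimal lifecycle sketch for a 4-D NHWC case (all parameter values below are illustrative, not mandated by this patch):

    mluOpDCNDescriptor_t dcn_desc;
    mluOpCreateDCNDescriptor(&dcn_desc);
    int pad[4] = {0, 0, 0, 0};   // top, bottom, left, right
    int stride[2] = {1, 1};      // stride_h, stride_w
    int dilation[2] = {1, 1};    // dilation_h, dilation_w
    // dimNb = 4, deformable_group = 1, conv_group = 1, im2col_step = 1
    mluOpSetDCNDescriptor(dcn_desc, 4, pad, stride, dilation, 1, 1, 1,
                          MLUOP_DTYPE_FLOAT);
    // ... run mluOpDCNForward / mluOpDCNBackwardData / mluOpDCNBackwardWeight ...
    mluOpDestroyDCNDescriptor(dcn_desc);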
diff --git a/kernels/dcn_forward/dcn_forward.cpp b/kernels/dcn_forward/dcn_forward.cpp
new file mode 100644
index 000000000..c746f8971
--- /dev/null
+++ b/kernels/dcn_forward/dcn_forward.cpp
@@ -0,0 +1,103 @@
+/*************************************************************************
+ * Copyright (C) [2024] by Cambricon, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/
+#include "kernels/dcn_forward/dcn_common.h"
+
+#define DCNFORWARD_API "mluOpDCNForward"
+
+mluOpStatus_t MLUOP_WIN_API mluOpGetDCNForwardWorkspaceSize(
+    mluOpHandle_t handle, const mluOpDCNDescriptor_t dcn_desc,
+    const mluOpTensorDescriptor_t input_desc,
+    const mluOpTensorDescriptor_t offset_desc,
+    const mluOpTensorDescriptor_t mask_desc,
+    const mluOpTensorDescriptor_t filter_desc,
+    const mluOpTensorDescriptor_t bias_desc,
+    const mluOpTensorDescriptor_t output_desc, size_t *size) {
+  PARAM_CHECK("mluOpDCNForward", handle != NULL);
+  PARAM_CHECK("mluOpDCNForward", dcn_desc != NULL);
+  PARAM_CHECK("mluOpDCNForward", input_desc != NULL);
+  PARAM_CHECK("mluOpDCNForward", offset_desc != NULL);
+  PARAM_CHECK("mluOpDCNForward", filter_desc != NULL);
+  PARAM_CHECK("mluOpDCNForward", output_desc != NULL);
+  PARAM_CHECK("mluOpDCNForward", size != NULL);
+  DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_input_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(offset_desc, cnnl_offset_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mask_desc, cnnl_mask_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filter_desc, cnnl_filter_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(bias_desc, cnnl_bias_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(output_desc, cnnl_output_desc);
+  CHECK_FUNC_RETURN(cnnlGetDCNForwardWorkspaceSize(
+                        cnnl_handle, dcn_desc, cnnl_input_desc,
+                        cnnl_offset_desc, cnnl_mask_desc, cnnl_filter_desc,
+                        cnnl_bias_desc, cnnl_output_desc, size),
+                    CNNL_STATUS_SUCCESS,
+                    "[mluOpGetDCNForwardWorkspaceSize] Internal error "
+                    "occurred in cnnlGetDCNForwardWorkspaceSize.",  // NOLINT
+                    MLUOP_STATUS_INTERNAL_ERROR);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_offset_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mask_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_filter_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_bias_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc);
+  DESTROY_CNNL_HANDLE(cnnl_handle);
+  return MLUOP_STATUS_SUCCESS;
+}
+
+mluOpStatus_t MLUOP_WIN_API
+mluOpDCNForward(mluOpHandle_t handle, const mluOpDCNDescriptor_t dcn_desc,
+                const mluOpTensorDescriptor_t input_desc, const void *input,
+                const mluOpTensorDescriptor_t offset_desc, const void *offset,
+                const mluOpTensorDescriptor_t mask_desc, const void *mask,
+                const mluOpTensorDescriptor_t filter_desc, const void *filter,
+                const mluOpTensorDescriptor_t bias_desc, const void *bias,
+                void *workspace, size_t workspace_size,
+                const mluOpTensorDescriptor_t output_desc, void *output) {
+  PARAM_CHECK(DCNFORWARD_API, handle != NULL);
+  if (workspace_size > 0) {
+    PARAM_CHECK(DCNFORWARD_API, workspace != NULL);
+  }
+  DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_input_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(offset_desc, cnnl_offset_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mask_desc, cnnl_mask_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filter_desc, cnnl_filter_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(bias_desc, cnnl_bias_desc);
+  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(output_desc, cnnl_output_desc);
+  CHECK_FUNC_RETURN(
+      cnnlDCNForward(cnnl_handle, dcn_desc, cnnl_input_desc, input,
+                     cnnl_offset_desc, offset, cnnl_mask_desc, mask,
+                     cnnl_filter_desc, filter, cnnl_bias_desc, bias, workspace,
+                     workspace_size, cnnl_output_desc, output),
+      CNNL_STATUS_SUCCESS,
+      "[mluOpDCNForward] Internal error occurred in cnnlDCNForward.",
+      MLUOP_STATUS_INTERNAL_ERROR);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_offset_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mask_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_filter_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_bias_desc);
+  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc);
+  DESTROY_CNNL_HANDLE(cnnl_handle);
+  return MLUOP_STATUS_SUCCESS;
+}
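A matching sketch for the forward path, mirroring the backward-weight example earlier (same assumptions: illustrative names, pre-set descriptors, cnrtMalloc assumed, status checks omitted):

    size_t fwd_ws_size = 0;
    mluOpGetDCNForwardWorkspaceSize(handle, dcn_desc, input_desc, offset_desc,
                                    mask_desc, filter_desc, bias_desc,
                                    output_desc, &fwd_ws_size);
    void *fwd_ws = nullptr;
    if (fwd_ws_size > 0) cnrtMalloc(&fwd_ws, fwd_ws_size);
    mluOpDCNForward(handle, dcn_desc, input_desc, input, offset_desc, offset,
                    mask_desc, mask, filter_desc, filter, bias_desc, bias,
                    fwd_ws, fwd_ws_size, output_desc, output);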
diff --git a/mlu_op.h b/mlu_op.h
old mode 100644
new mode 100755
index 0a1795162..f530af5e3
--- a/mlu_op.h
+++ b/mlu_op.h
@@ -12070,7 +12070,7 @@ mluOpSetDCNDescriptor(mluOpDCNDescriptor_t dcn_desc,
  * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_EXECUTION_FAILED
  *
  * @par Note
- * - Call this function after calling the ::mluOpDCNBackwardData,
+ * - Call this function after calling the ::mluOpDCNBackwardData, ::mluOpDCNForward,
  * or ::mluOpDCNBackwardWeight. Otherwise, \p MLUOP_STATUS_BAD_PARAM is returned.
  * - It is necessary to call this function destroy the deformable convolution descriptor.
  * to avoid the memory leaks.
@@ -12087,6 +12087,470 @@ mluOpDestroyDCNDescriptor(mluOpDCNDescriptor_t dcn_desc);
 mluOpStatus_t MLUOP_WIN_API
 mluOpDestroyDCNDescriptor(mluOpDCNDescriptor_t dcn_desc);
 
+// Group:DCN
+/*!
+ * @brief Returns in \p workspace_size the size of the MLU memory that is used as an extra
+ * workspace to optimize the deformable convolution forward operation.
+ *
+ * The size of the extra workspace is determined by the deformable convolution
+ * forward operation, including the deformable convolution descriptor \p dcn_desc,
+ * input tensor descriptor \p input_desc, offset tensor
+ * descriptor \p offset_desc, mask tensor descriptor \p mask_desc, filter tensor descriptor
+ * \p filter_desc, bias tensor descriptor \p bias_desc, and output tensor descriptor \p output_desc.
+ * For more information about the workspace, see "Cambricon MLUOP User Guide."
+ *
+ * @param[in] handle
+ * Input. Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in the
+ * deformable convolution operation. For detailed information, see ::mluOpHandle_t.
+ * @param[in] dcn_desc
+ * Input. The descriptor of the deformable convolution operation. For detailed information, see
+ * ::mluOpDCNDescriptor_t.
+ * @param[in] input_desc
+ * Input. The descriptor of the input tensor. For detailed information, see
+ * ::mluOpTensorDescriptor_t.
+ * @param[in] offset_desc
+ * Input. The descriptor of the offset tensor. For detailed information, see
+ * ::mluOpTensorDescriptor_t.
+ * @param[in] mask_desc
+ * Input. The descriptor of the mask tensor. Set this parameter to NULL if mask is not needed. For detailed
+ * information, see ::mluOpTensorDescriptor_t.
+ * @param[in] filter_desc
+ * Input. The descriptor of the filter tensor used as a filter in the deformable convolution
+ * operation. For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[in] bias_desc
+ * Input. The descriptor of the bias tensor. Set this parameter to NULL if bias is not needed.
+ * For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[in] output_desc
+ * Input. The descriptor of the output tensor.
+ * For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[out] workspace_size
+ * Output. Pointer to the returned size of the extra workspace in bytes that is used in the
+ * deformable convolution forward operation.
+ *
+ * @par Return
+ * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM
+ *
+ * @par API Dependency
+ * - You must call the ::mluOpCreateTensorDescriptor and ::mluOpSetTensorDescriptor functions
+ * to create and set the tensor descriptors \p input_desc, \p offset_desc, \p mask_desc (optional),
+ * \p filter_desc, and \p bias_desc (optional) before calling this function.
+ * - The allocated extra workspace must be passed to the ::mluOpDCNForward function to perform
+ * the deformable convolution forward operation.
+ *
+ * @par Note
+ * - None.
+ *
+ * @par Requirements
+ * - None.
+ *
+ * @par Example
+ * - None.
+ *
+ * @par Reference
+ * - None.
+ */
+mluOpStatus_t MLUOP_WIN_API
+mluOpGetDCNForwardWorkspaceSize(mluOpHandle_t handle,
+                                const mluOpDCNDescriptor_t dcn_desc,
+                                const mluOpTensorDescriptor_t input_desc,
+                                const mluOpTensorDescriptor_t offset_desc,
+                                const mluOpTensorDescriptor_t mask_desc,
+                                const mluOpTensorDescriptor_t filter_desc,
+                                const mluOpTensorDescriptor_t bias_desc,
+                                const mluOpTensorDescriptor_t output_desc,
+                                size_t *workspace_size);
+
+// Group:DCN
+/*!
+ * @brief Performs a 2D deformable convolution forward operation. Compared with the standard
+ * convolution, the deformable convolution introduces 2D offsets and masks to make
+ * the convolution adapt to the geometric variation of objects.
+ * Offsets act on the regular grid sampling locations, which enables a free form
+ * deformation of the sampling grid. The mask is a modulation mechanism that improves the ability
+ * to focus on pertinent image regions. Both offsets and masks are
+ * learnable parameters obtained from additional convolutional layers.
+ *
+ * @param[in] handle
+ * Input. Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues. For
+ * detailed information, see ::mluOpHandle_t.
+ * @param[in] dcn_desc
+ * Input. The descriptor of the deformable convolution. For detailed information, see
+ * ::mluOpDCNDescriptor_t.
+ * @param[in] input_desc
+ * Input. The descriptor of the input tensor. For detailed information,
+ * see ::mluOpTensorDescriptor_t.
+ * @param[in] input
+ * Input. Pointer to the MLU memory that stores the input tensor.
+ * @param[in] offset_desc
+ * Input. The descriptor of the offset tensor to be applied to each position in the convolution kernel.
+ * The shape of the offset should be (batch, out_height, out_width,
+ * 2 * deformable_group * filter_height * filter_width). For detailed information,
+ * see ::mluOpTensorDescriptor_t.
+ * @param[in] offset
+ * Input. Pointer to the MLU memory that stores the offset tensor.
+ * @param[in] mask_desc
+ * Input. The descriptor of the scaling factor to be applied to each position in the convolution
+ * kernel. The shape of the mask must be (batch, out_height, out_width,
+ * deformable_group * filter_height * filter_width). Set this parameter to NULL when
+ * the mask is not requested. For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[in] mask
+ * Input. Pointer to the MLU memory that stores the mask tensor. Set this parameter to NULL
+ * when mask is not requested.
+ * @param[in] filter_desc
+ * Input. The descriptor of the filter tensor used as a filter in the deformable convolution
+ * operation. For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[in] filter
+ * Input. Pointer to the MLU memory that stores the filter tensor.
+ * @param[in] bias_desc
+ * Input. The descriptor of the bias tensor. Set this parameter to NULL when bias is not
+ * requested.
+ * For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[in] bias
+ * Input. Pointer to the MLU memory that stores the bias tensor. Set this parameter to NULL when bias is not
+ * requested.
+ * @param[in] workspace
+ * Input. Pointer to the MLU memory that is used as an extra workspace for the
+ * deformable convolution operation. For more information about workspace, see
+ * "Cambricon MLUOP User Guide".
+ * @param[in] workspace_size
+ * Input. The size of the extra workspace in bytes needed for the deformable
+ * convolution operation. You can get the size of the workspace with the
+ * ::mluOpGetDCNForwardWorkspaceSize function.
+ * @param[in] output_desc
+ * Input. The descriptor of the output tensor. The shape of the output is the same as
+ * that of a standard convolution output.
+ * @param[out] output
+ * Output. Pointer to the MLU memory that stores the output tensor.
+ * @par Return
+ * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM,
+ * ::MLUOP_STATUS_NOT_SUPPORTED, ::MLUOP_STATUS_NUMERICAL_OVERFLOW
+ * @par Formula
+ * - See "Deformable Convolution Operator" section in "Cambricon MLUOP User Guide" for details.
+ *
+ * @par Data Type
+ * - The off-chip data type of \p input, \p offset, \p mask, \p filter, \p bias, and \p output must be the same.
+ * - The supported off-chip data types of the input tensor and output tensor are as follows:
+ * - input, offset, mask, filter, bias, output: half, float.
+ * - This function supports any combinations of the following on-chip data types for input tensor
+ * \p input and \p filter on MLU200 series and CE3226.
+ * - \p input onchip data type: int16, int31.
+ * - \p filter onchip data type: int16, int31.
+ * - \p input offchip data type can be combined with any supported onchip data types.
+ * - \p filter offchip data type can be combined with any supported onchip data types.
+ * - This function also supports floating-point computation on MLU300 series or above.
+ * To perform floating-point computation, the onchip data type of \p input and \p filter
+ * should be \p MLUOP_DTYPE_INVALID or the same as the corresponding offchip data type.
+ *
+ * @par Data Layout
+ * - The supported data layouts of the input tensor, filter, bias tensor, and output tensor are
+ * as follows:
+ * - input, offset, mask, filter, output: \p MLUOP_LAYOUT_NHWC.
+ * - bias: \p MLUOP_LAYOUT_ARRAY.
+ *
+ * @par Scale Limitation
+ * - The input, offset, mask, filter, bias, output and the deformable convolution descriptor
+ * (including pad, stride, dilation, deformable_group, conv_group, im2col_step) must meet the
+ * following requirements:
+ * - input tensor: \p batch > 0, \p height > 0, \p width > 0, \p channel > 0.
+ * - offset tensor: \p batch should be equal to the batch size of input tensor, \p height and \p width
+ * should be equal to the height and width of output tensor accordingly. \p channel should be equal to
+ * deformable_group * filter_height * filter_width * 2.
+ * - mask tensor: When mask is needed, \p batch should be equal to the batch size of input tensor,
+ * \p height and \p width should be equal to the height and width of output tensor accordingly.
+ * \p channel should be equal to deformable_group * filter_height * filter_width.
+ * - The value of (im2col_step * out_height * out_width * filter_h * filter_w * input_channel)
+ * should be less than or equal to the INT_MAX defined in limits.h.
+ * @par API Dependency
+ * - Before calling this function to implement deformable convolution, you need to prepare
+ * all the parameters passed to this function.
+ * See each parameter description for details.
+ *
+ * @par Performance Optimization
+ * - To achieve better performance, set the im2col_step equal to the batch
+ * size of the input tensor.
+ *
+ * @par Note
+ * - \p input, \p offset, \p mask, \p filter, \p bias, and \p output
+ * must be contiguous in the MLU memory.
+ *
+ * @par Requirements
+ * - None.
+ *
+ * @par Example
+ * - The example of the deformable convolution forward operation is as follows:
+ @verbatim
+
+ input tensor by 1 * 3 * 3 * 2 --> input:
+ [[[[0.7944, 0.4922], [0.2008, 0.2081], [0.9998, 0.3053]],
+   [[0.1815, 0.9210], [0.8463, 0.1819], [0.9159, 0.4917]],
+   [[0.6668, 0.2843], [0.8364, 0.2765], [0.7150, 0.6780]]]]
+ offset tensor by 1 * 3 * 3 * 2 --> offset:
+ [[[[-0.6317, -1.4928], [-0.0696, 1.1910], [ 0.8778, 0.5145]],
+   [[-0.9248, -0.9889], [ 0.6157, 0.2157], [-1.1540, -0.1283]],
+   [[-0.5704, 1.0237], [ 0.7956, 1.1203], [-0.0129, -0.2686]]]]
+ mask tensor by 1 * 3 * 3 * 1 --> mask:
+ [[[[ 0.4581], [-1.1605], [ 0.5951]],
+   [[ 0.4313], [ 0.1070], [ 0.0225]],
+   [[ 0.7484], [ 0.6262], [ 1.1908]]]]
+ filter tensor by 2 * 1 * 1 * 2 --> filter:
+ [[[[0.8928, 0.9682]]], [[[0.9301, 0.6817]]]]
+ bias tensor by 2 --> bias:
+ [0.4356, 0.0840]
+
+ param:
+ pad: (0, 0, 0, 0), stride: (1, 1), dilation: (1, 1)
+
+ output tensor by 1 * 3 * 3 * 2 --> output:
+ [[[[ 0.4356, 0.0840], [-0.6024, -0.9101], [ 0.8056, 0.4252]],
+   [[ 0.4412, 0.0890], [ 0.5478, 0.1898], [ 0.4562, 0.1037]],
+   [[ 1.1652, 0.7876], [ 0.5814, 0.2109], [ 1.8874, 1.3752]]]]
+ @endverbatim
+ *
+ * @par Reference
+ * - https://github.com/msracver/Deformable-ConvNets
+ * - Deformable Convolutional Networks, Jifeng Dai, et al., 2017.
+ * - Deformable ConvNets v2: More Deformable, Better Results, Xizhou Zhu, et al., 2018.
+ */
+mluOpStatus_t MLUOP_WIN_API
+mluOpDCNForward(mluOpHandle_t handle,
+                const mluOpDCNDescriptor_t dcn_desc,
+                const mluOpTensorDescriptor_t input_desc,
+                const void *input,
+                const mluOpTensorDescriptor_t offset_desc,
+                const void *offset,
+                const mluOpTensorDescriptor_t mask_desc,
+                const void *mask,
+                const mluOpTensorDescriptor_t filter_desc,
+                const void *filter,
+                const mluOpTensorDescriptor_t bias_desc,
+                const void *bias,
+                void *workspace,
+                size_t workspace_size,
+                const mluOpTensorDescriptor_t output_desc,
+                void *output);
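Since the offset and mask shape rules above are easy to get wrong, here is a small sketch of the descriptor extents they imply, using the real ::mluOpSetTensorDescriptor API but illustrative variable names (dg, kh, and kw stand for deformable_group and the filter height/width; none of these names are part of the patch):

    int offset_dims[4] = {batch, out_height, out_width, 2 * dg * kh * kw};
    int mask_dims[4] = {batch, out_height, out_width, dg * kh * kw};
    mluOpSetTensorDescriptor(offset_desc, MLUOP_LAYOUT_NHWC, MLUOP_DTYPE_FLOAT,
                             4, offset_dims);
    mluOpSetTensorDescriptor(mask_desc, MLUOP_LAYOUT_NHWC, MLUOP_DTYPE_FLOAT,
                             4, mask_dims);

In the @verbatim example above, dg = kh = kw = 1, so the offset has 2 channels and the mask has 1, matching the listed tensors.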
+
+// Group:DCN
+/*!
+ * @brief Returns in \p workspace_size the size of the MLU memory that is used as an extra
+ * workspace to optimize the deformable convolution backward filter operation.
+ *
+ * The size of the extra workspace is determined by the deformable convolution
+ * backward filter operation, including the deformable convolution descriptor \p dcn_desc,
+ * input tensor descriptor \p input_desc, offset tensor
+ * descriptor \p offset_desc, mask tensor descriptor \p mask_desc, gradient with respect to
+ * the output tensor \p grad_output_desc, the gradient with respect to the filter tensor
+ * \p grad_filter_desc, and the gradient with respect to the bias tensor \p grad_bias_desc.
+ * For more information about the workspace, see "Cambricon MLUOP User Guide."
+ *
+ * @param[in] handle
+ * Input. Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in the
+ * deformable convolution operation. For detailed information, see ::mluOpHandle_t.
+ * @param[in] dcn_desc
+ * Input. The descriptor of the deformable convolution operation. For detailed information, see
+ * ::mluOpDCNDescriptor_t.
+ * @param[in] input_desc
+ * Input. The descriptor of the input tensor. For detailed information, see
+ * ::mluOpTensorDescriptor_t.
+ * @param[in] offset_desc
+ * Input. The descriptor of the offset tensor. For detailed information, see
+ * ::mluOpTensorDescriptor_t.
+ * @param[in] mask_desc
+ * Input. The descriptor of the mask tensor. Set this parameter to NULL if mask is not needed. For detailed
+ * information, see ::mluOpTensorDescriptor_t.
+ * @param[in] grad_output_desc
+ * Input. The descriptor of the gradient with respect to the output tensor.
+ * For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[in] grad_filter_desc
+ * Input. The descriptor of the gradient with respect to the filter tensor.
+ * For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[in] grad_bias_desc
+ * Input. The descriptor of the gradient with respect to the bias tensor.
+ * Set this parameter to NULL if the gradient with respect to bias is not needed.
+ * For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[out] workspace_size
+ * Output. Pointer to the returned size of the extra workspace in bytes that is used in the
+ * deformable convolution backward filter operation.
+ *
+ * @par Return
+ * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM
+ *
+ * @par API Dependency
+ * - You must call the ::mluOpCreateTensorDescriptor and ::mluOpSetTensorDescriptor functions
+ * to create and set the tensor descriptors \p input_desc, \p offset_desc, \p mask_desc (optional),
+ * \p grad_output_desc, \p grad_filter_desc, and \p grad_bias_desc (optional) before calling this
+ * function.
+ * - The allocated extra workspace must be passed to the ::mluOpDCNBackwardWeight function to
+ * perform the deformable convolution backward filter operation.
+ *
+ * @par Note
+ * - None.
+ *
+ * @par Requirements
+ * - None.
+ *
+ * @par Example
+ * - None.
+ *
+ * @par Reference
+ * - None.
+ */
+
+mluOpStatus_t MLUOP_WIN_API
+mluOpGetDCNBackwardWeightWorkspaceSize(mluOpHandle_t handle,
+                                       const mluOpDCNDescriptor_t dcn_desc,
+                                       const mluOpTensorDescriptor_t input_desc,
+                                       const mluOpTensorDescriptor_t offset_desc,
+                                       const mluOpTensorDescriptor_t mask_desc,
+                                       const mluOpTensorDescriptor_t grad_output_desc,
+                                       const mluOpTensorDescriptor_t grad_filter_desc,
+                                       const mluOpTensorDescriptor_t grad_bias_desc,
+                                       size_t *workspace_size);
+
+// Group:DCN
+/*!
+ * @brief Performs the back-propagation of a deformable convolution operation to compute
+ * the gradient with respect to filter \p grad_filter and bias \p grad_bias
+ * based on the gradient of response \p grad_output.
+ *
+ * This function needs extra MLU memory as the workspace to improve the performance.
+ * You can get the size of the workspace \p workspace_size with the
+ * ::mluOpGetDCNBackwardWeightWorkspaceSize function.
+ *
+ * @param[in] handle
+ * Input. Handle to a Cambricon MLUOP context that is used to manage MLU devices and
+ * queues in the deformable convolution backward filter operation. For detailed information,
+ * see ::mluOpHandle_t.
+ * @param[in] dcn_desc
+ * Input. The descriptor of the deformable convolution operation. For detailed information,
+ * see ::mluOpDCNDescriptor_t.
+ * @param[in] input_desc
+ * Input. The descriptor of the input tensor. For detailed information,
+ * see ::mluOpTensorDescriptor_t.
+ * @param[in] input
+ * Input. Pointer to the MLU memory that stores the input tensor.
+ * @param[in] offset_desc
+ * Input. The descriptor of the offset to be applied to each position in the convolution kernel.
+ * The shape of offset should be (batch, out_height, out_width,
+ * 2 * deformable_group * filter_height * filter_width). For detailed information,
+ * see ::mluOpTensorDescriptor_t.
+ * @param[in] offset
+ * Input. Pointer to the MLU memory that stores the offset tensor.
+ * @param[in] mask_desc
+ * Input. The descriptor of the scaling factor to be applied to each position in the convolution
+ * kernel. The shape of the mask must be (batch, out_height, out_width,
+ * deformable_group * filter_height * filter_width). Set this parameter to NULL when
+ * mask is not requested. For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[in] mask
+ * Input. Pointer to the MLU memory that stores the mask tensor. Set this parameter to NULL when mask is not
+ * requested.
+ * @param[in] grad_output_desc
+ * Input. The descriptor of the gradient with respect to the output tensor.
+ * For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[in] grad_output
+ * Input. Pointer to the MLU memory that stores the gradient with respect to the output tensor.
+ * @param[in] workspace
+ * Input. Pointer to the MLU memory that is used as an extra workspace for the
+ * deformable convolution backward filter operation. For more information about workspace,
+ * see "Cambricon MLUOP User Guide".
+ * @param[in] workspace_size
+ * Input. The size of the extra workspace in bytes needed for
+ * the deformable convolution backward filter operation. You can get the size of the workspace
+ * with the ::mluOpGetDCNBackwardWeightWorkspaceSize function.
+ * @param[in] grad_filter_desc
+ * Input. The descriptor of the gradient with respect to the filter tensor.
+ * For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[out] grad_filter
+ * Output. Pointer to the MLU memory that stores the gradient with respect to the filter tensor.
+ * @param[in] grad_bias_desc
+ * Input. The descriptor of the gradient with respect to the bias tensor. Set this parameter to NULL if the
+ * gradient of the bias tensor is not needed. For detailed information,
+ * see ::mluOpTensorDescriptor_t.
+ * @param[out] grad_bias
+ * Output. Pointer to the MLU memory that stores the gradient with respect to the bias tensor.
+ * Set this parameter to NULL if the gradient of the bias tensor is not needed.
+ * @par Return
+ * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM,
+ * ::MLUOP_STATUS_NOT_SUPPORTED, ::MLUOP_STATUS_NUMERICAL_OVERFLOW
+ *
+ * @par Formula
+ * - See "Deformable Convolution Operator" section in "Cambricon MLUOP User Guide" for details.
+ *
+ * @par Data Type
+ * - The off-chip data type of \p input, \p offset, \p mask, \p grad_output, \p grad_filter,
+ * and \p grad_bias must be the same.
+ * - The supported off-chip data types of the input tensor and output tensor are as follows:
+ * - input, offset, mask, grad_output, grad_filter, grad_bias: half, float.
+ * - This function supports any combinations of the following on-chip data types for input tensor
+ * \p grad_output and \p input on MLU200 series and CE3226.
+ * - \p grad_output on-chip data type: int16, int31.
+ * - \p input on-chip data type: int16, int31.
+ * - \p grad_output off-chip data type can be combined with any supported on-chip data types.
+ * - \p input off-chip data type can be combined with any supported on-chip data types.
+ * - This function also supports floating-point computation on MLU300 series or above.
+ * To perform
+ * floating-point computation, the on-chip data type of \p input and \p grad_output should be
+ * \p MLUOP_DTYPE_INVALID or the same as the corresponding off-chip data type.
+ *
+ * @par Data Layout
+ * - The data layout of the input, offset, mask, grad_output, and grad_filter
+ * should be \p MLUOP_LAYOUT_NHWC.
+ * - The data layout of grad_bias should be \p MLUOP_LAYOUT_ARRAY.
+ *
+ * @par Scale Limitation
+ * - The input, offset, mask, grad_output, grad_filter, grad_bias and
+ * the deformable convolution descriptor
+ * (including pad, stride, dilation, deformable_group, conv_group, im2col_step) must meet the
+ * following requirements:
+ * - input tensor: \p batch > 0, \p height > 0, \p width > 0, \p channel > 0.
+ * - offset tensor: \p batch should be equal to the batch of input tensor, \p height and \p width
+ * should be equal to the height and width of output tensor. \p channel should be equal to
+ * deformable_group * filter_height * filter_width * 2.
+ * - mask tensor: When mask is needed, \p batch should be equal to the batch of input tensor,
+ * \p height and \p width should be equal to the height and width of output tensor.
+ * \p channel should be equal to deformable_group * filter_height * filter_width.
+ * - grad_bias tensor: When the gradient of bias is needed, \p grad_bias should be a
+ * one-dimensional array with the length of \p out_channel.
+ * - The value of (im2col_step * out_height * out_width * filter_h * filter_w * input_channel)
+ * should be less than or equal to the INT_MAX defined in limits.h.
+ *
+ * @par API Dependency
+ * - Before calling this function to implement the backward filter of deformable convolution,
+ * you need to prepare all the parameters passed to this function. See each parameter
+ * description for details.
+ *
+ * @par Performance Optimization
+ * - To achieve better performance, set the im2col_step to the batch size.
+ *
+ * @par Note
+ * - \p input, \p offset, \p mask, \p grad_output, \p grad_filter, and \p grad_bias
+ * must be contiguous in the MLU memory.
+ *
+ * @par Requirements
+ * - None.
+ *
+ * @par Example
+ * - None.
+ *
+ * @par Reference
+ * - https://github.com/msracver/Deformable-ConvNets
+ * - Deformable Convolutional Networks, Jifeng Dai, et al., 2017.
+ * - Deformable ConvNets v2: More Deformable, Better Results, Xizhou Zhu, et al., 2018.
+ */
+mluOpStatus_t MLUOP_WIN_API
+mluOpDCNBackwardWeight(mluOpHandle_t handle,
+                       const mluOpDCNDescriptor_t dcn_desc,
+                       const mluOpTensorDescriptor_t input_desc,
+                       const void *input,
+                       const mluOpTensorDescriptor_t offset_desc,
+                       const void *offset,
+                       const mluOpTensorDescriptor_t mask_desc,
+                       const void *mask,
+                       const mluOpTensorDescriptor_t grad_output_desc,
+                       const void *grad_output,
+                       void *workspace,
+                       size_t workspace_size,
+                       const mluOpTensorDescriptor_t grad_filter_desc,
+                       void *grad_filter,
+                       const mluOpTensorDescriptor_t grad_bias_desc,
+                       void *grad_bias);
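To make the documented output shapes concrete, a short sketch of the gradient descriptors this entry point expects (symbols follow the gtest comments later in this patch: co = output channels, kh/kw = filter height/width, ci = input channels, g = conv_group; all names are illustrative):

    int grad_filter_dims[4] = {co, kh, kw, ci / g};  // NHWC filter gradient
    int grad_bias_dims[1] = {co};                    // 1-D, length out_channel
    mluOpSetTensorDescriptor(grad_filter_desc, MLUOP_LAYOUT_NHWC,
                             MLUOP_DTYPE_FLOAT, 4, grad_filter_dims);
    mluOpSetTensorDescriptor(grad_bias_desc, MLUOP_LAYOUT_ARRAY,
                             MLUOP_DTYPE_FLOAT, 1, grad_bias_dims);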
+
 // Group:DCN
 /*!
  * @brief Returns in \p workspace_size the size of the MLU memory that is used as an extra
@@ -12203,7 +12667,7 @@ mluOpGetDCNBakcwardDataWorkspaceSize(mluOpHandle_t handle,
  * @param[in] offset
  * Input. Pointer to the MLU memory that stores the offset tensor.
  * @param[in] mask_desc
- * Input. The descriptor of the scaling factor to be applied for each position in the convolution
+ * Input. The descriptor of the scaling factor to be applied to each position in the convolution
  * kernel. The shape of mask must be (batch, out_height, out_width,
  * deformable_group * filter_height * filter_width). Set this parameter to NULL when
  * mask is not requested. For detailed information, see ::mluOpTensorDescriptor_t.
@@ -12225,7 +12689,7 @@ mluOpGetDCNBakcwardDataWorkspaceSize(mluOpHandle_t handle,
  * deformable convolution backward data operation. For more information about workspace,
  * see "Cambricon MLU-OPS User Guide".
  * @param[in] workspace_size
- * Input. The size of the extra workspace in bytes that needs to be used in
+ * Input. The size of the extra workspace in bytes needed for
  * the deformable convolution backward data operation. You can get the size of the workspace
  * with the ::mluOpGetDCNBakcwardDataWorkspaceSize function.
  * @param[in] grad_input_desc
@@ -12265,8 +12729,8 @@ mluOpGetDCNBakcwardDataWorkspaceSize(mluOpHandle_t handle,
  * - \p filter onchip data type: int16, int31.
  * - \p grad_output offchip data type can be combined with any supported onchip data types.
  * - \p filter offchip data type can be combined with any supported onchip data types.
- * - This function also supports float-point computation on MLU300 series or above. To perform
- * float-point computation, the onchip data type of \p grad_output and \p filter must be
+ * - This function also supports floating-point computation on MLU300 series or above. To perform
+ * floating-point computation, the onchip data type of \p grad_output and \p filter must be
  * \p MLUOP_DTYPE_INVALID or the same as the corresponding offchip data type.
  *
  * @par Data Layout
diff --git a/test/mlu_op_gtest/pb_gtest/src/internal_kernel/transpose_cpu/transpose_cpu.cpp b/test/mlu_op_gtest/pb_gtest/src/internal_kernel/transpose_cpu/transpose_cpu.cpp
new file mode 100644
index 000000000..3a807147d
--- /dev/null
+++ b/test/mlu_op_gtest/pb_gtest/src/internal_kernel/transpose_cpu/transpose_cpu.cpp
@@ -0,0 +1,136 @@
+/*************************************************************************
+ * Copyright (C) [2022] by Cambricon, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/
+
+#include "transpose_cpu.h"
+#include
+#include "core/tensor.h"
+
+
+template <typename T>
+static void transposeCpuNd(const int loop_d, T *x, T *y, const uint64_t sum,
+                           uint64_t *dim, uint64_t *DIM, uint64_t *permute) {
+  for (int loop_t = 0; loop_t < loop_d; loop_t++) {
+    T *output = (T *)(y + sum * loop_t);
+    T *input = (T *)(x + sum * loop_t);
+    uint64_t in_index = 0, out_index = 0;
+
+    for (dim[0] = 0; dim[0] < DIM[0]; dim[0]++) {
+      for (dim[1] = 0; dim[1] < DIM[1]; dim[1]++) {
+        for (dim[2] = 0; dim[2] < DIM[2]; dim[2]++) {
+          for (dim[3] = 0; dim[3] < DIM[3]; dim[3]++) {
+            for (dim[4] = 0; dim[4] < DIM[4]; dim[4]++) {
+              for (dim[5] = 0; dim[5] < DIM[5]; dim[5]++) {
+                for (dim[6] = 0; dim[6] < DIM[6]; dim[6]++) {
+                  for (dim[7] = 0; dim[7] < DIM[7]; dim[7]++) {
+                    in_index =
+                        dim[0] * DIM[1] * DIM[2] * DIM[3] * DIM[4] * DIM[5] *
+                            DIM[6] * DIM[7] +
+                        dim[1] * DIM[2] * DIM[3] * DIM[4] * DIM[5] * DIM[6] *
+                            DIM[7] +
+                        dim[2] * DIM[3] * DIM[4] * DIM[5] * DIM[6] * DIM[7] +
+                        dim[3] * DIM[4] * DIM[5] * DIM[6] * DIM[7] +
+                        dim[4] * DIM[5] * DIM[6] * DIM[7] +
+                        dim[5] * DIM[6] * DIM[7] + dim[6] * DIM[7] + dim[7];
+                    out_index =
+                        dim[permute[0]] * DIM[permute[1]] * DIM[permute[2]] *
+                            DIM[permute[3]] * DIM[permute[4]] *
+                            DIM[permute[5]] * DIM[permute[6]] *
+                            DIM[permute[7]] +
+                        dim[permute[1]] * DIM[permute[2]] * DIM[permute[3]] *
+                            DIM[permute[4]] * DIM[permute[5]] *
+                            DIM[permute[6]] * DIM[permute[7]] +
+                        dim[permute[2]] * DIM[permute[3]] * DIM[permute[4]] *
+                            DIM[permute[5]] * DIM[permute[6]] *
+                            DIM[permute[7]] +
+                        dim[permute[3]] * DIM[permute[4]] * DIM[permute[5]] *
+                            DIM[permute[6]] * DIM[permute[7]] +
+                        dim[permute[4]] * DIM[permute[5]] * DIM[permute[6]] *
+                            DIM[permute[7]] +
+                        dim[permute[5]] * DIM[permute[6]] * DIM[permute[7]] +
+                        dim[permute[6]] * DIM[permute[7]] + dim[permute[7]];
+                    output[out_index] = input[in_index];
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+mluOpStatus_t mluOpTransposeCpu(const int64_t dim_desc,
+                                const std::vector<int> permute_desc,
+                                const mluOpTensorDescriptor_t x_desc,
+                                const void *x,
+                                const mluOpTensorDescriptor_t y_desc, void *y) {
+  PARAM_CHECK("[mluOpTransposeCpu]", x_desc != NULL);
+  PARAM_CHECK("[mluOpTransposeCpu]", y_desc != NULL);
+  uint64_t sum = mluOpGetTensorElementNum(x_desc);
+  // zero elements, return success
+  if (sum == 0 || x_desc->dim == 0 || y_desc->dim == 0) {
+    VLOG(5) << "mluOpTransposeCpu:: zero elements, return success.";
+    return MLUOP_STATUS_SUCCESS;
+  }
+  PARAM_CHECK("[mluOpTransposeCpu]", x != NULL);
+  PARAM_CHECK("[mluOpTransposeCpu]", y != NULL);
+
+  const uint64_t dim_all = dim_desc;
+  auto data_type = x_desc->dtype;
+  int loop_d = 1;
+  if (data_type == MLUOP_DTYPE_INT31) {
+    loop_d = 2;
+  }
+  // do not change the inited value(8) in permute
+  // 8 is used to match TRANSPOSE_MAX_DIM, which makes the loop below
+  // apply to all-dims transpose, from 2D transpose to 8D transpose
+  // if you change macro TRANSPOSE_MAX_DIM, the inited value(8) should also be
+  // changed to TRANSPOSE_MAX_DIM.
+  // And the loop level should be equal to TRANSPOSE_MAX_DIM.
+  uint64_t permute[TRANSPOSE_MAX_DIM] = {8, 8, 8, 8, 8, 8, 8, 8};
+  uint64_t DIM[TRANSPOSE_MAX_DIM + 1] = {1, 1, 1, 1, 1, 1, 1, 1, 1};
+  uint64_t dim[TRANSPOSE_MAX_DIM + 1] = {0};
+
+  if (x_desc->dim != dim_all || y_desc->dim != dim_all) {
+    LOG(ERROR)
+        << "mluOpTransposeCpu: dimension information mismatch, dim of x: "
+        << x_desc->dim << ", dim of y: " << y_desc->dim
+        << ", dim of descriptor: " << dim_all;
+    return MLUOP_STATUS_BAD_PARAM;
+  }
+
+  for (int i = 0; i < dim_all; i++) {
+    permute[i] = permute_desc[i];
+    DIM[i] = x_desc->dims[i];
+  }
+  if (MLUOP_DTYPE_INT31 == data_type) {
+    transposeCpuNd(loop_d, (int16_t *)x, (int16_t *)y, sum, dim, DIM, permute);
+  } else if (MLUOP_DTYPE_COMPLEX_HALF == data_type ||
+             MLUOP_DTYPE_COMPLEX_FLOAT == data_type) {
+    transposeCpuNd(loop_d, (double *)x, (double *)y, sum, dim, DIM, permute);
+  } else {
+    transposeCpuNd(loop_d, (float *)x, (float *)y, sum, dim, DIM, permute);
+  }
+  return MLUOP_STATUS_SUCCESS;
+}
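A brief usage sketch of this host-side helper as a gtest executor might call it (descriptors assumed to be created and set elsewhere; variable names are illustrative):

    // Transpose a 4-D NHWC float tensor to NCHW on the host.
    std::vector<int> permute = {0, 3, 1, 2};
    mluOpStatus_t status =
        mluOpTransposeCpu(4, permute, x_desc, x_host, y_desc, y_host);
    // Both descriptors must describe 4-D tensors here; otherwise
    // MLUOP_STATUS_BAD_PARAM is returned.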
diff --git a/test/mlu_op_gtest/pb_gtest/src/internal_kernel/transpose_cpu/transpose_cpu.h b/test/mlu_op_gtest/pb_gtest/src/internal_kernel/transpose_cpu/transpose_cpu.h
new file mode 100644
index 000000000..198fb58d8
--- /dev/null
+++ b/test/mlu_op_gtest/pb_gtest/src/internal_kernel/transpose_cpu/transpose_cpu.h
@@ -0,0 +1,39 @@
+/*************************************************************************
+ * Copyright (C) [2022] by Cambricon, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/
+#ifndef TEST_MLU_OP_GTEST_SRC_INTERNAL_KERNEL_TRANSPOSE_CPU_TRANSPOSE_CPU_H_
+#define TEST_MLU_OP_GTEST_SRC_INTERNAL_KERNEL_TRANSPOSE_CPU_TRANSPOSE_CPU_H_
+
+#include <vector>
+#include "core/tensor.h"
+#include "kernels/kernel.h"
+#include "kernels/debug.h"
+
+#define TRANSPOSE_MAX_DIM 8
+
+mluOpStatus_t mluOpTransposeCpu(const int64_t dim,
+                                const std::vector<int> permute,
+                                const mluOpTensorDescriptor_t x_desc,
+                                const void *x,
+                                const mluOpTensorDescriptor_t y_desc, void *y);
+
+#endif  // TEST_MLU_OP_GTEST_SRC_INTERNAL_KERNEL_TRANSPOSE_CPU_TRANSPOSE_CPU_H_
diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_weight/dcn_backward_weight.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_weight/dcn_backward_weight.cpp
new file mode 100644
index 000000000..b29e9a525
--- /dev/null
+++ b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_weight/dcn_backward_weight.cpp
@@ -0,0 +1,672 @@
+/*************************************************************************
+ * Copyright (C) [2019-2022] by Cambricon, Inc.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/
+#include "dcn_backward_weight.h"
+#include "internal_kernel/transpose_cpu/transpose_cpu.h"
+
+#define USE_OPENBLAS 0
+
+#if USE_OPENBLAS
+#include
+#endif
+
+namespace mluoptest {
+// input      :[N,hi,wi,ci]
+// offset     :[N,ho,wo,dg*kh*kw*2]
+// mask       :[N,ho,wo,dg*kh*kw]  // optional
+// grad_output:[N,ho,wo,co]
+// grad_weight:[co,kh,kw,ci/g]
+// grad_bias  :[co]  // optional
+static inline bool isFixData(mluOpDataType_t type) {
+  if (MLUOP_DTYPE_INT8 == type || MLUOP_DTYPE_INT16 == type ||
+      MLUOP_DTYPE_INT31 == type) {
+    return true;
+  }
+  return false;
+}
+
+int DcnBackwardWeightExecutor::getCoefficientOfLT2CT() {
+  auto input_dtype =
+      cvtProtoDtypeToMluOp(parser_->getProtoNode()->input(0).dtype());
+  int lt_compute_force = 0;
+  int ct_compute_force = input_dtype == MLUOP_DTYPE_FLOAT ? 32 : 64;
+  if (input_dtype == MLUOP_DTYPE_FLOAT) {
+    lt_compute_force = 2 * 1.5 * 1024;
+  } else {
+    lt_compute_force = 2 * 0.375 * 1024;
+  }
+  return lt_compute_force / ct_compute_force;
+}
+
+void DcnBackwardWeightExecutor::paramCheck() {
+  if (parser_->getInputNum() != 3 && parser_->getInputNum() != 4) {
+    LOG(ERROR) << "DCN_Backward_Weight: tensor input number is wrong.";
+  }
+
+  auto dtype =
+      cvtProtoDtypeToMluOp(parser_->getProtoNode()->input(0).onchip_dtype());
+  input_onchip_dtype = dtype;
+  if (isFixData(dtype)) {
+    parser_->input(0)->oc_dt = MLUOP_DTYPE_INVALID;
+  }
+
+  if (parser_->getInputNum() == 3) {
+    dtype =
+        cvtProtoDtypeToMluOp(parser_->getProtoNode()->input(2).onchip_dtype());
+    grad_output_onchip_dtype = dtype;
+    if (isFixData(dtype)) {
+      parser_->input(2)->oc_dt = MLUOP_DTYPE_INVALID;
+    }
+  } else {
+    dtype =
+        cvtProtoDtypeToMluOp(parser_->getProtoNode()->input(3).onchip_dtype());
+    grad_output_onchip_dtype = dtype;
+    if (isFixData(dtype)) {
+      parser_->input(3)->oc_dt = MLUOP_DTYPE_INVALID;
+    }
+  }
+
+  if (!parser_->getProtoNode()->has_dcn_param()) {
+    LOG(ERROR) << "Missing dcn param.";
+  }
+
+  if (parser_->getOutputNum() != 1 && parser_->getOutputNum() != 2) {
+    LOG(ERROR) << "DCN_Backward_Weight: tensor output number is wrong.";
+  }
+  TensorLayout input_order = parser_->getProtoNode()->input(0).layout();
+  if (input_order != LAYOUT_NHWC) {
+    LOG(ERROR) << "DCN_Backward_Weight: input tensor layout should be NHWC.";
+  }
+
+  int N = parser_->getProtoNode()->input(0).shape().dims(0);
+  int ci = parser_->getProtoNode()->input(0).shape().dims(3);
+  int co = parser_->getProtoNode()->output(0).shape().dims(0);
+
+  auto dcn_param = parser_->getProtoNode()->dcn_param();
+  dimnb = dcn_param.dimnb();
+  for (int i = 0; i < dcn_param.pad_size(); ++i) {
+    pad[i] = dcn_param.pad(i);
+  }
+  for (int i = 0; i < dcn_param.stride_size(); ++i) {
+    stride[i] = dcn_param.stride(i);
+  }
+  for (int i = 0; i < dcn_param.dilation_size(); ++i) {
+    dilation[i] = dcn_param.dilation(i);
+  }
+  if (dcn_param.has_deformable_group()) {
+    dg = dcn_param.deformable_group();
+  }
+  if (dcn_param.has_conv_group()) {
+    g = dcn_param.conv_group();
+  }
+  if (dcn_param.has_im2col_step()) {
+    im2col_step = dcn_param.im2col_step();
+  }
+
+  if (dimnb != 4) {
+    LOG(ERROR) << "[DCN_Backward_Weight]: dimnb should be 4.";
+  }
+
+  if (ci % dg) {
+    LOG(ERROR) << "[DCN_Backward_Weight]: ci should be divisible by "
+                  "deformable_group.";
+  }
+
+  if (ci % g) {
+    LOG(ERROR) << "[DCN_Backward_Weight]: ci should be divisible by "
+                  "conv_group.";
+  }
+
+  if (co % g) {
+    LOG(ERROR) << "[DCN_Backward_Weight]: co should be divisible by "
+                  "conv_group.";
+  }
+
+  if (N % im2col_step) {
+    LOG(ERROR) << "[DCN_Backward_Weight]: N should be divisible by "
+                  "im2col_step.";
+  }
+}
+
+void DcnBackwardWeightExecutor::workspaceMalloc() {
+  input_desc = tensor_desc_[0].tensor;
+  offset_desc = tensor_desc_[1].tensor;
+  mluOpDataType_t compute_type;
+  auto dcn_param = parser_->getProtoNode()->dcn_param();
+  if (dcn_param.has_compute_type()) {
+    compute_type = cvtProtoDtypeToMluOp(dcn_param.compute_type());
+  } else {
+    compute_type = MLUOP_DTYPE_FLOAT;
+  }
+
+  mluOpDCNDescriptor_t dcn_desc = cpu_runtime_.allocate(
+      mluOpCreateDCNDescriptor, mluOpDestroyDCNDescriptor);
+
+  MLUOP_CHECK(mluOpSetDCNDescriptor(dcn_desc, dimnb, pad, stride, dilation, dg,
+                                    g, im2col_step, compute_type));
+
+  if (parser_->getInputNum() == 3) {
+    mask_desc = nullptr;
+    grad_output_desc = tensor_desc_[2].tensor;
+    grad_weight_desc = tensor_desc_[3].tensor;
+    grad_bias_desc =
+        parser_->getOutputNum() == 1 ? nullptr : tensor_desc_[4].tensor;
nullptr : tensor_desc_[4].tensor; + } else { + mask_desc = tensor_desc_[2].tensor; + grad_output_desc = tensor_desc_[3].tensor; + grad_weight_desc = tensor_desc_[4].tensor; + grad_bias_desc = + parser_->getOutputNum() == 1 ? nullptr : tensor_desc_[5].tensor; + } + + input_desc->onchip_dtype = input_onchip_dtype; + grad_output_desc->onchip_dtype = grad_output_onchip_dtype; + MLUOP_CHECK(mluOpGetDCNBackwardWeightWorkspaceSize( + handle_, dcn_desc, input_desc, offset_desc, mask_desc, grad_output_desc, + grad_weight_desc, grad_bias_desc, &workspace_size)); + + if (workspace_size != 0) { + workspace = mlu_runtime_.allocate(workspace_size); + } + + eva_->setMluWorkspaceSize(workspace_size); + cpu_runtime_.deallocate(dcn_desc); +} + +void DcnBackwardWeightExecutor::workspaceFree() { + if (workspace != nullptr) { + mlu_runtime_.deallocate(workspace); + } +} + +void DcnBackwardWeightExecutor::compute() { + input_desc = tensor_desc_[0].tensor; + offset_desc = tensor_desc_[1].tensor; + mluOpDataType_t compute_type; + auto dcn_param = parser_->getProtoNode()->dcn_param(); + if (dcn_param.has_compute_type()) { + compute_type = cvtProtoDtypeToMluOp(dcn_param.compute_type()); + } else { + compute_type = input_desc->dtype; + } + + mluOpDCNDescriptor_t dcn_desc = cpu_runtime_.allocate( + mluOpCreateDCNDescriptor, mluOpDestroyDCNDescriptor); + + MLUOP_CHECK(mluOpSetDCNDescriptor(dcn_desc, dimnb, pad, stride, dilation, dg, + g, im2col_step, compute_type)); + + input = data_vector_[0].device_ptr; + offset = data_vector_[1].device_ptr; + if (parser_->getInputNum() == 3) { + mask_desc = nullptr; + mask = nullptr; + grad_output_desc = tensor_desc_[2].tensor; + grad_output = data_vector_[2].device_ptr; + grad_weight_desc = tensor_desc_[3].tensor; + grad_weight = data_vector_[3].device_ptr; + grad_bias_desc = + parser_->getOutputNum() == 1 ? nullptr : tensor_desc_[4].tensor; + grad_bias = + parser_->getOutputNum() == 1 ? nullptr : data_vector_[4].device_ptr; + } else { + mask_desc = tensor_desc_[2].tensor; + mask = data_vector_[2].device_ptr; + grad_output_desc = tensor_desc_[3].tensor; + grad_output = data_vector_[3].device_ptr; + grad_weight_desc = tensor_desc_[4].tensor; + grad_weight = data_vector_[4].device_ptr; + grad_bias_desc = + parser_->getOutputNum() == 1 ? nullptr : tensor_desc_[5].tensor; + grad_bias = + parser_->getOutputNum() == 1 ? 
nullptr : data_vector_[5].device_ptr;
+  }
+
+  input_desc->onchip_dtype = input_onchip_dtype;
+  grad_output_desc->onchip_dtype = grad_output_onchip_dtype;
+
+  VLOG(4) << "call mluOpDCNBackwardWeight()";
+  interface_timer_.start();
+  MLUOP_CHECK(mluOpDCNBackwardWeight(
+      handle_, dcn_desc, input_desc, input, offset_desc, offset, mask_desc,
+      mask, grad_output_desc, grad_output, workspace, workspace_size,
+      grad_weight_desc, grad_weight, grad_bias_desc, grad_bias));
+
+  interface_timer_.stop();
+  cpu_runtime_.deallocate(dcn_desc);
+}
+
+static float bilinear(float *input_ptr, const int &ci_offset, const int &hi,
+                      const int &wi, const int &ci, const float &h_in,
+                      const float &w_in) {
+  int h_low = floor(h_in);
+  int w_low = floor(w_in);
+  int h_high = h_low + 1;
+  int w_high = w_low + 1;
+
+  float lh = h_in - h_low;
+  float lw = w_in - w_low;
+  float hh = 1 - lh;
+  float hw = 1 - lw;
+
+  float v1 = 0, v2 = 0, v3 = 0, v4 = 0;
+
+  // Corners that fall outside the feature map keep their zero value.
+  if (h_low >= 0 && w_low >= 0) {
+    v1 = input_ptr[(h_low * wi + w_low) * ci + ci_offset];
+  }
+
+  if (h_low >= 0 && w_high <= wi - 1) {
+    v2 = input_ptr[(h_low * wi + w_high) * ci + ci_offset];
+  }
+
+  if (h_high <= hi - 1 && w_low >= 0) {
+    v3 = input_ptr[(h_high * wi + w_low) * ci + ci_offset];
+  }
+
+  if (h_high <= hi - 1 && w_high <= wi - 1) {
+    v4 = input_ptr[(h_high * wi + w_high) * ci + ci_offset];
+  }
+
+  float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+  float val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
+  return val;
+}
+
+static void im2col(const int &N, const int &im2col_step, const int &dg,
+                   const int &hi, const int &wi, const int &ci, const int &ho,
+                   const int &wo, const int &co, const int &kh, const int &kw,
+                   const int &pt, const int &pb, const int &pl, const int &pr,
+                   const int &sh, const int &sw, const int &dh, const int &dw,
+                   const float *cpu_input, const float *cpu_offset,
+                   const float *cpu_mask, float *buffer) {
+  // input      :[N,hi,wi,ci]
+  // offset     :[N,ho,wo,dg*kh*kw*2]
+  // mask       :[N,ho,wo,dg*kh*kw]  // optional
+  // grad_output:[N,ho,wo,co]
+  // grad_weight:[co,kh,kw,ci/g]
+  // grad_bias  :[co]  // optional
+  for (int idx_n = 0; idx_n < im2col_step; ++idx_n) {
+    for (int idx_ho = 0; idx_ho < ho; ++idx_ho) {
+      for (int idx_wo = 0; idx_wo < wo; ++idx_wo) {
+        float *input_ptr = (float *)cpu_input + idx_n * hi * wi * ci;
+        float *offset_ptr =
+            (float *)cpu_offset +
+            ((idx_n * ho + idx_ho) * wo + idx_wo) * dg * kh * kw * 2;
+        float *mask_ptr =
+            cpu_mask != nullptr
+                ? (float *)cpu_mask +
+                      ((idx_n * ho + idx_ho) * wo + idx_wo) * dg * kh * kw
+                : nullptr;
+        float *columns_ptr =
+            (float *)buffer +
+            ((idx_n * ho + idx_ho) * wo + idx_wo) * kh * kw * ci;
+        const int hi_start = idx_ho * sh - pt;
+        const int wi_start = idx_wo * sw - pl;
+        for (int idx_kh = 0; idx_kh < kh; ++idx_kh) {
+          for (int idx_kw = 0; idx_kw < kw; ++idx_kw) {
+            for (int idx_dg = 0; idx_dg < dg; ++idx_dg) {
+              const int data_offset_h =
+                  ((idx_dg * kh + idx_kh) * kw + idx_kw) * 2;
+              const int data_offset_w =
+                  ((idx_dg * kh + idx_kh) * kw + idx_kw) * 2 + 1;
+              const int data_mask = (idx_dg * kh + idx_kh) * kw + idx_kw;
+              const float offset_h = offset_ptr[data_offset_h];
+              const float offset_w = offset_ptr[data_offset_w];
+              const float mask =
+                  mask_ptr != nullptr ? mask_ptr[data_mask] : 1.0f;
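+              // Deformable sampling (descriptive note): each kernel tap reads
+              // the input at its regular grid position, i.e.
+              // (hi_start + idx_kh * dh, wi_start + idx_kw * dw), shifted by
+              // the learned fractional offset; the bilinear sample is then
+              // scaled by the optional modulation mask, as in the DCNv2
+              // formulation.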
+              const float h_in = hi_start + idx_kh * dh + offset_h;
+              const float w_in = wi_start + idx_kw * dw + offset_w;
+              if (h_in > -1 && w_in > -1 && h_in < hi && w_in < wi) {
+                for (int idx_ci = 0; idx_ci < ci / dg; ++idx_ci) {
+                  const int ci_offset = idx_dg * ci / dg + idx_ci;
+                  const int columns_offset =
+                      (idx_kh * kw + idx_kw) * ci + ci_offset;
+                  columns_ptr[columns_offset] =
+                      bilinear(input_ptr, ci_offset, hi, wi, ci, h_in, w_in) *
+                      mask;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void DcnBackwardWeightExecutor::transpose(float *input, float *output,
+                                          const int dims[], const int dim_num,
+                                          int permute[]) {
+  int64_t dim_desc = dim_num;
+  std::vector<int> permute_desc;
+  if (dim_desc > 8 || dim_desc <= 0) {
+    LOG(ERROR) << "dim_desc is " << dim_desc
+               << ", it should be greater than 0 and no more than 8";
+  }
+  { std::vector<int>().swap(permute_desc); }
+  for (int i = 0; i < dim_num; i++) {
+    permute_desc.push_back(permute[i]);
+  }
+  mluOpTensorDescriptor_t input_desc, output_desc;
+  input_desc = cpu_runtime_.allocate(mluOpCreateTensorDescriptor,
+                                     mluOpDestroyTensorDescriptor);
+  output_desc = cpu_runtime_.allocate(mluOpCreateTensorDescriptor,
+                                      mluOpDestroyTensorDescriptor);
+
+  // sized to TRANSPOSE_MAX_DIM so every dim_num accepted above fits
+  int dims_trans[TRANSPOSE_MAX_DIM];
+  for (int i = 0; i < dim_num; ++i) {
+    dims_trans[i] = dims[permute[i]];
+  }
+
+  MLUOP_CHECK(mluOpSetTensorDescriptor(input_desc, MLUOP_LAYOUT_ARRAY,
+                                       MLUOP_DTYPE_FLOAT, dim_num, dims));
+  MLUOP_CHECK(mluOpSetTensorDescriptor(output_desc, MLUOP_LAYOUT_ARRAY,
+                                       MLUOP_DTYPE_FLOAT, dim_num, dims_trans));
+
+  MLUOP_CHECK(mluOpTransposeCpu(dim_desc, permute_desc, input_desc, input,
+                                output_desc, output));
+  cpu_runtime_.deallocate(input_desc);
+  cpu_runtime_.deallocate(output_desc);
+}
+
+static void BatchMatMul(const int &g, const int &m, const int &k, const int &n,
+                        float *input_a, float *input_b, float *output,
+                        bool is_transa, bool is_transb) {
+  const int batch_size = g;
+
+  assert(batch_size >= 1);
+#if USE_OPENBLAS
+  const CBLAS_ORDER Order = CblasRowMajor;
+  const CBLAS_TRANSPOSE TransA = is_transa ? CblasTrans : CblasNoTrans;
+  const CBLAS_TRANSPOSE TransB = is_transb ? CblasTrans : CblasNoTrans;
+
+  int lda = is_transa ? m : k;
+  int ldb = is_transb ?
k : n; + int ldc = n; + + float alpha = 1.0f; + float beta = 1.0f; +#else + auto matmul = [](float *lhs, float *rhs, float *output, bool is_trans_a, + bool is_trans_b, int M, int N, int K) { + for (int m = 0; m < M; m++) { + for (int n = 0; n < N; n++) { + // output[m * N + n] = 0.0f; + for (int k = 0; k < K; k++) { + int lhs_idx = m * K + k; + if (is_trans_a) lhs_idx = k * M + m; + int rhs_idx = k * N + n; + if (is_trans_b) rhs_idx = n * K + k; + output[m * N + n] += lhs[lhs_idx] * rhs[rhs_idx]; + } + } + } + }; +#endif + for (int i = 0; i < batch_size; ++i) { +#if USE_OPENBLAS + cblas_sgemm(Order, TransA, TransB, m, n, k, alpha, input_a + i * m * k, lda, + input_b + i * k * n, ldb, beta, output + i * m * n, ldc); +#else + matmul(input_a + i * m * k, input_b + i * k * n, output + i * m * n, + is_transa, is_transb, m, n, k); +#endif + } +} + +static void dealBias(float *cpu_grad_output, float *cpu_grad_bias, const int &N, + const int &ho, const int &wo, const int &co) { + for (int idx_n = 0; idx_n < N; ++idx_n) { + for (int idx_ho = 0; idx_ho < ho; ++idx_ho) { + for (int idx_wo = 0; idx_wo < wo; ++idx_wo) { + for (int idx_co = 0; idx_co < co; ++idx_co) { + cpu_grad_bias[idx_co] += + cpu_grad_output[((idx_n * ho + idx_ho) * wo + idx_wo) * co + + idx_co]; + } + } + } + } +} + +void DcnBackwardWeightExecutor::computeDCNBackwardWeightCPU( + const int &dg, const int &g, const int &im2col_step, + const mluOpTensorDescriptor_t input_desc, const void *cpu_input, + const mluOpTensorDescriptor_t offset_desc, const void *cpu_offset, + const mluOpTensorDescriptor_t mask_desc, const void *cpu_mask, + const mluOpTensorDescriptor_t grad_output_desc, const void *cpu_grad_output, + const mluOpTensorDescriptor_t grad_weight_desc, void *cpu_grad_weight, + const mluOpTensorDescriptor_t grad_bias_desc, void *cpu_grad_bias, + float *buffer, int pad[], int stride[], int dilation[], + int64_t &theory_ops) { + const int N = input_desc->dims[0]; + const int hi = input_desc->dims[1]; + const int wi = input_desc->dims[2]; + const int ci = input_desc->dims[3]; + const int ho = offset_desc->dims[1]; + const int wo = offset_desc->dims[2]; + const int co = grad_output_desc->dims[3]; + const int kh = grad_weight_desc->dims[1]; + const int kw = grad_weight_desc->dims[2]; + const int pt = pad[0]; + const int pb = pad[1]; + const int pl = pad[2]; + const int pr = pad[3]; + const int sh = stride[0]; + const int sw = stride[1]; + const int dh = dilation[0]; + const int dw = dilation[1]; + + int coeff = getCoefficientOfLT2CT(); + if (g == 1) { + // buffer: | columns_a | + for (int i = 0; i < N / im2col_step; ++i) { + float *input_i = (float *)cpu_input + i * im2col_step * hi * wi * ci; + float *offset_i = + (float *)cpu_offset + i * im2col_step * ho * wo * dg * kh * kw * 2; + float *mask_i = + cpu_mask != nullptr + ? 
(float *)cpu_mask + i * im2col_step * ho * wo * dg * kh * kw + : nullptr; + float *grad_output_i = + (float *)cpu_grad_output + i * im2col_step * ho * wo * co; + // 1.im2col + memset(buffer, 0, (im2col_step * ho * wo * kh * kw * ci) * sizeof(float)); + im2col(N, im2col_step, dg, hi, wi, ci, ho, wo, co, kh, kw, pt, pb, pl, pr, + sh, sw, dh, dw, input_i, offset_i, mask_i, (float *)buffer); + theory_ops += (int64_t)im2col_step * ho * wo * kh * kw * ci * + 15; // bilinear(14) + mask(1) + + float *input_a = grad_output_i; + float *input_b = buffer; + const int k = im2col_step * ho * wo; + const int m = co; + const int n = kh * kw * ci; + // 2.BMM + BatchMatMul(g, m, k, n, input_a, input_b, (float *)cpu_grad_weight, true, + false); + theory_ops += 2 * (int64_t)g * m * k * n / coeff; // lt2ct + } + } else { + // | columns_a | columns_b | grad_output | + float *buffer_columns_a = buffer; + float *buffer_columns_b = + buffer_columns_a + im2col_step * ho * wo * kh * kw * ci; + float *buffer_grad_output = + buffer_columns_b + im2col_step * ho * wo * kh * kw * ci; + for (int i = 0; i < N / im2col_step; ++i) { + float *input_i = (float *)cpu_input + i * im2col_step * hi * wi * ci; + float *offset_i = + (float *)cpu_offset + i * im2col_step * ho * wo * dg * kh * kw * 2; + float *mask_i = + cpu_mask != nullptr + ? (float *)cpu_mask + i * im2col_step * ho * wo * dg * kh * kw + : nullptr; + float *grad_output_i = + (float *)cpu_grad_output + i * im2col_step * ho * wo * co; + // 1.im2col + memset(buffer, 0, (im2col_step * ho * wo * kh * kw * ci) * sizeof(float)); + im2col(N, im2col_step, dg, hi, wi, ci, ho, wo, co, kh, kw, pt, pb, pl, pr, + sh, sw, dh, dw, input_i, offset_i, mask_i, buffer_columns_a); + theory_ops += (int64_t)im2col_step * ho * wo * kh * kw * ci * + 15; // bilinear_count + mask + + // 2.split columns [im2col_step*ho*wo*kh*kw,g, + // ci/g]->[g,im2col_step*ho*wo*kh*kw,ci/g] + int dims_1[3] = {im2col_step * ho * wo * kh * kw, g, ci / g}; + int permute_1[3] = {1, 0, 2}; + transpose(buffer_columns_a, buffer_columns_b, dims_1, 3, permute_1); + theory_ops += (int64_t)im2col_step * ho * wo * kh * kw * ci; + + // 3.transpose grad_output [im2col_step*ho*wo,co]-> + // [g,co/g,im2col_step*ho*wo] + int dims_2[2] = {im2col_step * ho * wo, co}; + int permute_2[2] = {1, 0}; + transpose(grad_output_i, buffer_grad_output, dims_2, 2, permute_2); + theory_ops += (int64_t)im2col_step * ho * wo * co; + + float *input_a = buffer_grad_output; + float *input_b = buffer_columns_b; + const int k = im2col_step * ho * wo; + const int m = co / g; + const int n = kh * kw * ci / g; + + // 4.BMM + BatchMatMul(g, m, k, n, input_a, input_b, (float *)cpu_grad_weight, false, + false); + theory_ops += 2 * (int64_t)g * m * k * n / coeff; // lt2ct + } + } + // 5.grad_bias + if (cpu_grad_bias) { + dealBias((float *)cpu_grad_output, (float *)cpu_grad_bias, N, ho, wo, co); + theory_ops += (int64_t)N * ho * wo * co; + } +} + +void DcnBackwardWeightExecutor::cpuCompute() { + input_desc = tensor_desc_[0].tensor; + offset_desc = tensor_desc_[1].tensor; + cpu_input = cpu_fp32_input_[0]; + cpu_offset = cpu_fp32_input_[1]; + if (parser_->getInputNum() == 3) { + mask_desc = nullptr; + cpu_mask = nullptr; + grad_output_desc = tensor_desc_[2].tensor; + cpu_grad_output = cpu_fp32_input_[2]; + + grad_weight_desc = tensor_desc_[3].tensor; + cpu_grad_weight = cpu_fp32_output_[0]; + grad_bias_desc = + parser_->getOutputNum() == 1 ? nullptr : tensor_desc_[4].tensor; + cpu_grad_bias = + parser_->getOutputNum() == 1 ? 
nullptr : cpu_fp32_output_[1];
+  } else {
+    mask_desc = tensor_desc_[2].tensor;
+    cpu_mask = cpu_fp32_input_[2];
+    grad_output_desc = tensor_desc_[3].tensor;
+    cpu_grad_output = cpu_fp32_input_[3];
+    grad_weight_desc = tensor_desc_[4].tensor;
+    cpu_grad_weight = cpu_fp32_output_[0];
+    grad_bias_desc =
+        parser_->getOutputNum() == 1 ? nullptr : tensor_desc_[5].tensor;
+    cpu_grad_bias =
+        parser_->getOutputNum() == 1 ? nullptr : cpu_fp32_output_[1];
+  }
+
+  const int ho = offset_desc->dims[1];
+  const int wo = offset_desc->dims[2];
+  const int kh = grad_weight_desc->dims[1];
+  const int kw = grad_weight_desc->dims[2];
+  const int ci = input_desc->dims[3];
+  const int co = grad_output_desc->dims[3];
+
+  size_t cpu_buffer_size = 0;
+  if (g == 1) {
+    cpu_buffer_size =
+        (static_cast<size_t>(im2col_step) * ho * wo * kh * kw * ci) *
+        sizeof(float);
+  } else {
+    cpu_buffer_size = (2lu * im2col_step * ho * wo * kh * kw * ci +
+                       im2col_step * ho * wo * co) *
+                      sizeof(float);
+  }
+
+  float *buffer = nullptr;
+  buffer = (float *)cpu_runtime_.allocate(cpu_buffer_size);
+  if (buffer == nullptr) {
+    LOG(ERROR) << "dcn_backward_weight: allocate buffer failed.";
+  }
+  if (cpu_grad_weight) {
+    memset(cpu_grad_weight, 0, co * kh * kw * ci / g * sizeof(float));
+  }
+  if (cpu_grad_bias) {
+    memset(cpu_grad_bias, 0, co * sizeof(float));
+  }
+  theory_ops = 0;
+  computeDCNBackwardWeightCPU(
+      dg, g, im2col_step, input_desc, cpu_input, offset_desc, cpu_offset,
+      mask_desc, cpu_mask, grad_output_desc, cpu_grad_output, grad_weight_desc,
+      cpu_grad_weight, grad_bias_desc, cpu_grad_bias, buffer, pad, stride,
+      dilation, theory_ops);
+
+  cpu_runtime_.deallocate(buffer);
+}
+
+int64_t DcnBackwardWeightExecutor::getTheoryOps() {
+  if (exe_config_->mlu_only) {
+    theory_ops = 0;
+    input_desc = tensor_desc_[0].tensor;
+    offset_desc = tensor_desc_[1].tensor;
+    if (parser_->getInputNum() == 3) {
+      grad_output_desc = tensor_desc_[2].tensor;
+      grad_weight_desc = tensor_desc_[3].tensor;
+      grad_bias_desc =
+          parser_->getOutputNum() == 1 ? nullptr : tensor_desc_[4].tensor;
+    } else {
+      grad_output_desc = tensor_desc_[3].tensor;
+      grad_weight_desc = tensor_desc_[4].tensor;
+      grad_bias_desc =
+          parser_->getOutputNum() == 1 ?
nullptr : tensor_desc_[5].tensor; + } + const int N = input_desc->dims[0]; + const int hi = input_desc->dims[1]; + const int wi = input_desc->dims[2]; + const int ci = input_desc->dims[3]; + const int ho = offset_desc->dims[1]; + const int wo = offset_desc->dims[2]; + const int co = grad_output_desc->dims[3]; + const int kh = grad_weight_desc->dims[1]; + const int kw = grad_weight_desc->dims[2]; + int coeff = getCoefficientOfLT2CT(); + const int k = im2col_step * ho * wo; + const int m = co / g; + const int n = kh * kw * ci / g; + if (g == 1) { + for (int i = 0; i < N / im2col_step; ++i) { + theory_ops += (int64_t)im2col_step * ho * wo * kh * kw * ci * + 15; // bilinear(14) + mask(1) + theory_ops += 2 * (int64_t)g * m * k * n / coeff; // lt2ct + } + } else { + for (int i = 0; i < N / im2col_step; ++i) { + theory_ops += (int64_t)im2col_step * ho * wo * kh * kw * ci * + 15; // bilinear_count + mask + theory_ops += (int64_t)im2col_step * ho * wo * kh * kw * ci; + theory_ops += (int64_t)im2col_step * ho * wo * co; + theory_ops += 2 * (int64_t)g * m * k * n / coeff; // lt2ct + } + } + if (grad_bias_desc) { + theory_ops += (int64_t)N * ho * wo * co; + } + } + VLOG(4) << "getTheoryOps: " << theory_ops << " ops"; + return theory_ops; +} + +} // namespace mluoptest diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_weight/dcn_backward_weight.h b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_weight/dcn_backward_weight.h new file mode 100755 index 000000000..a193499d5 --- /dev/null +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_weight/dcn_backward_weight.h @@ -0,0 +1,86 @@ +/************************************************************************* + * Copyright (C) [2019-2022] by Cambricon, Inc. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/
+#ifndef TEST_MLUOP_GTEST_SRC_ZOO_DCN_BACKWARD_WEIGHT_DCN_BACKWARD_WEIGHT_H_
+#define TEST_MLUOP_GTEST_SRC_ZOO_DCN_BACKWARD_WEIGHT_DCN_BACKWARD_WEIGHT_H_
+
+#include <vector>
+#include "executor.h"
+
+namespace mluoptest {
+
+class DcnBackwardWeightExecutor : public Executor {
+ public:
+  DcnBackwardWeightExecutor() {}
+  ~DcnBackwardWeightExecutor() {}
+
+  void workspaceMalloc();
+  void workspaceFree();
+  void paramCheck();
+  void compute();
+  void cpuCompute();
+  int64_t getTheoryOps() override;
+
+ private:
+  void transpose(float *input, float *output, const int dims[],
+                 const int dim_num, int permute[]);
+  int getCoefficientOfLT2CT();
+  void computeDCNBackwardWeightCPU(
+      const int &dg, const int &g, const int &im2col_step,
+      const mluOpTensorDescriptor_t input_desc, const void *cpu_input,
+      const mluOpTensorDescriptor_t offset_desc, const void *cpu_offset,
+      const mluOpTensorDescriptor_t mask_desc, const void *cpu_mask,
+      const mluOpTensorDescriptor_t grad_output_desc,
+      const void *cpu_grad_output,
+      const mluOpTensorDescriptor_t grad_weight_desc, void *cpu_grad_weight,
+      const mluOpTensorDescriptor_t grad_bias_desc, void *cpu_grad_bias,
+      float *buffer, int pad[], int stride[], int dilation[],
+      int64_t &theory_ops);
+
+  mluOpDataType_t input_onchip_dtype;
+  mluOpDataType_t grad_output_onchip_dtype;
+
+  mluOpTensorDescriptor_t input_desc;
+  mluOpTensorDescriptor_t offset_desc;
+  mluOpTensorDescriptor_t mask_desc = nullptr;  // optional
+  mluOpTensorDescriptor_t grad_output_desc;
+  mluOpTensorDescriptor_t grad_weight_desc;
+  mluOpTensorDescriptor_t grad_bias_desc = nullptr;  // optional
+
+  int dimnb;
+  int pad[4];
+  int stride[2];
+  int dilation[2];
+  int dg;
+  int g;
+  int im2col_step;
+
+  void *input = nullptr;
+  void *offset = nullptr;
+  void *mask = nullptr;
+  void *grad_output = nullptr;
+  void *grad_weight = nullptr;
+  void *grad_bias = nullptr;
+
+  void *cpu_input = nullptr;
+  void *cpu_offset = nullptr;
+  void *cpu_mask = nullptr;
+  void *cpu_grad_output = nullptr;
+  void *cpu_grad_weight = nullptr;
+  void *cpu_grad_bias = nullptr;
+
+  void *workspace = nullptr;
+  size_t workspace_size = 0;
+  int64_t theory_ops = 0;
+};
+
+}  // namespace mluoptest
+#endif  // TEST_MLUOP_GTEST_SRC_ZOO_DCN_BACKWARD_WEIGHT_DCN_BACKWARD_WEIGHT_H_
diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_weight/test_case/case_hi_16.prototxt b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_weight/test_case/case_hi_16.prototxt
new file mode 100755
index 000000000..1f57195e6
--- /dev/null
+++ b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_weight/test_case/case_hi_16.prototxt
@@ -0,0 +1,112 @@
+op_name: "dcn_backward_weight"
+op_type: DCN_BACKWARD_WEIGHT
+input {
+  id: "input"
+  shape: {
+    dims: 1
+    dims: 16
+    dims: 16
+    dims: 300
+  }
+  layout: LAYOUT_NHWC
+  dtype: DTYPE_FLOAT
+  random_data: {
+    seed: 23
+    upper_bound: 2.4
+    lower_bound: -2.2
+    distribution: UNIFORM
+  }
+}
+input {
+  id: "offset"
+  shape: {
+    dims: 1
+    dims: 16
+    dims: 16
+    dims: 36
+  }
+  layout: LAYOUT_NHWC
+  dtype: DTYPE_FLOAT
+  random_data: {
+    seed: 23
+    upper_bound: 3.4
+    lower_bound: -2.9
+    distribution: UNIFORM
+  }
+}
+input {
+  id: "mask"
+  shape: {
+    dims: 1
+    dims: 16
+    dims: 16
+    dims: 18
+  }
+  layout: LAYOUT_NHWC
+  dtype: DTYPE_FLOAT
+  random_data: {
+    seed: 23
+    upper_bound: 1
+    lower_bound: 0
+    distribution: UNIFORM
+  }
+}
+
+input {
+  id: "grad_output"
+  shape: {
+    dims: 1
+    dims: 16
+    dims: 16
+    dims: 300
+  }
+  layout: LAYOUT_NHWC
+  dtype: DTYPE_FLOAT
+  random_data: {
+    seed: 23
+    upper_bound: 1
+    lower_bound: -1
+    distribution: UNIFORM
+  }
+}
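+# Shape notes: with kh = kw = 3, deformable_group = 2 and conv_group = 3
+# (see dcn_param below), the offset carries dg*kh*kw*2 = 36 channels, the
+# mask carries dg*kh*kw = 18 channels, and grad_weight is
+# [co, kh, kw, ci/g] = [300, 3, 3, 100].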
+output {
+  id: "grad_weight"
+  shape: {
+    dims: 300
+    dims: 3
+    dims: 3
+    dims: 100
+  }
+  layout: LAYOUT_NHWC
+  dtype: DTYPE_FLOAT
+}
+output {
+  id: "grad_bias"
+  shape: {
+    dims: 300
+  }
+  layout: LAYOUT_ARRAY
+  dtype: DTYPE_FLOAT
+}
+dcn_param: {
+  dimnb: 4
+  pad: 1
+  pad: 1
+  pad: 1
+  pad: 1
+  stride: 1
+  stride: 1
+  dilation: 1
+  dilation: 1
+  deformable_group: 2
+  conv_group: 3
+  im2col_step: 1
+  compute_type: 2
+}
+test_param: {
+  error_func: DIFF1
+  error_func: DIFF2
+  error_threshold: 0.003
+  error_threshold: 0.003
+  baseline_device: CPU
+}
diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_forward/dcn_forward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_forward/dcn_forward.cpp
new file mode 100644
index 000000000..9a210931d
--- /dev/null
+++ b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_forward/dcn_forward.cpp
@@ -0,0 +1,701 @@
+/*************************************************************************
+ * Copyright (C) [2019-2022] by Cambricon, Inc.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/
+#include "dcn_forward.h"
+#include "internal_kernel/transpose_cpu/transpose_cpu.h"
+
+#define USE_OPENBLAS 0
+
+#if USE_OPENBLAS
+#include <openblas/cblas.h>
+#endif
+
+namespace mluoptest {
+// input :[N,hi,wi,ci]
+// offset:[N,ho,wo,dg*kh*kw*2]
+// mask  :[N,ho,wo,dg*kh*kw]  // optional
+// weight:[co,kh,kw,ci/g]
+// bias  :[co]  // optional
+// output:[N,ho,wo,co]
+
+int DcnForwardExecutor::getCoefficientOfLT2CT() {
+  auto input_dtype =
+      cvtProtoDtypeToMluOp(parser_->getProtoNode()->input(0).dtype());
+  int lt_compute_force = 0;
+  int ct_compute_force = input_dtype == MLUOP_DTYPE_FLOAT ? 32 : 64;
+  if (input_dtype == MLUOP_DTYPE_FLOAT) {
+    lt_compute_force = 2 * 1.5 * 1024;
+  } else {
+    lt_compute_force = 2 * 0.375 * 1024;
+  }
+  return lt_compute_force / ct_compute_force;
+}
+
+void DcnForwardExecutor::paramCheck() {
+  if (parser_->getInputNum() != 3 && parser_->getInputNum() != 4 &&
+      parser_->getInputNum() != 5) {
+    LOG(ERROR) << "DCN_Forward tensor input number is wrong.";
+  }
+
+  auto dtype =
+      cvtProtoDtypeToMluOp(parser_->getProtoNode()->input(0).onchip_dtype());
+  input_onchip_dtype = dtype;
+
+  if (parser_->getInputNum() == 3 ||
+      parser_->getProtoNode()->input(3).shape().dims_size() == 1) {
+    dtype =
+        cvtProtoDtypeToMluOp(parser_->getProtoNode()->input(2).onchip_dtype());
+    weight_onchip_dtype = dtype;
+  } else {
+    dtype =
+        cvtProtoDtypeToMluOp(parser_->getProtoNode()->input(3).onchip_dtype());
+    weight_onchip_dtype = dtype;
+  }
+
+  if (!parser_->getProtoNode()->has_dcn_param()) {
+    LOG(ERROR) << "Missing dcn param. 
"; + } + + if (parser_->getOutputNum() != 1) { + LOG(ERROR) << "DCN_Forward tensor output number is wrong."; + } + TensorLayout input_order = parser_->getProtoNode()->input(0).layout(); + if (input_order != LAYOUT_NHWC) { + LOG(ERROR) << "DCN_Forward input tensor layout should be NHWC."; + } + + int N = parser_->getProtoNode()->input(0).shape().dims(0); + int ci = parser_->getProtoNode()->input(0).shape().dims(3); + int co = parser_->getProtoNode()->output(0).shape().dims(3); + + auto dcn_param = parser_->getProtoNode()->dcn_param(); + dimnb = dcn_param.dimnb(); + for (int i = 0; i < dcn_param.pad_size(); ++i) { + pad[i] = dcn_param.pad(i); + } + for (int i = 0; i < dcn_param.stride_size(); ++i) { + stride[i] = dcn_param.stride(i); + } + for (int i = 0; i < dcn_param.dilation_size(); ++i) { + dilation[i] = dcn_param.dilation(i); + } + if (dcn_param.has_deformable_group()) { + dg = dcn_param.deformable_group(); + } + if (dcn_param.has_conv_group()) { + g = dcn_param.conv_group(); + } + if (dcn_param.has_im2col_step()) { + im2col_step = dcn_param.im2col_step(); + } + + if (dimnb != 4) { + LOG(ERROR) << "[DCN_Forward]: dimnb should be 4."; + } + + if (ci % dg) { + LOG(ERROR) << "[DCN_Forward]: deformable_group is wrong."; + } + + if (ci % g) { + LOG(ERROR) << "[DCN_Forward]: conv_group is wrong."; + } + + if (co % g) { + LOG(ERROR) << "[DCN_Forward]: conv_group is wrong."; + } + + if (N % im2col_step) { + LOG(ERROR) << "[DCN_Forward]: im2col_step is wrong."; + } +} + +void DcnForwardExecutor::workspaceMalloc() { + input_desc = tensor_desc_[0].tensor; + offset_desc = tensor_desc_[1].tensor; + mluOpDataType_t compute_type; + auto dcn_param = parser_->getProtoNode()->dcn_param(); + if (dcn_param.has_compute_type()) { + compute_type = cvtProtoDtypeToMluOp(dcn_param.compute_type()); + } else { + compute_type = MLUOP_DTYPE_FLOAT; + } + mluOpDataType_t compute_type1 = compute_type; + mluOpDCNDescriptor_t dcn_desc = cpu_runtime_.allocate( + mluOpCreateDCNDescriptor, mluOpDestroyDCNDescriptor); + MLUOP_CHECK(mluOpSetDCNDescriptor(dcn_desc, dimnb, pad, stride, dilation, dg, + g, im2col_step, compute_type1)); + + if (parser_->getInputNum() == 3) { + mask_desc = nullptr; + weight_desc = tensor_desc_[2].tensor; + bias_desc = nullptr; + output_desc = tensor_desc_[3].tensor; + } else if (parser_->getInputNum() == 4) { + if (parser_->getProtoNode()->input(3).shape().dims_size() == 4) { + mask_desc = tensor_desc_[2].tensor; + weight_desc = tensor_desc_[3].tensor; + bias_desc = nullptr; + output_desc = tensor_desc_[4].tensor; + } else { + mask_desc = nullptr; + weight_desc = tensor_desc_[2].tensor; + bias_desc = tensor_desc_[3].tensor; + output_desc = tensor_desc_[4].tensor; + } + } else { + mask_desc = tensor_desc_[2].tensor; + weight_desc = tensor_desc_[3].tensor; + bias_desc = tensor_desc_[4].tensor; + output_desc = tensor_desc_[5].tensor; + } + + input_desc->onchip_dtype = input_onchip_dtype; + weight_desc->onchip_dtype = weight_onchip_dtype; + + MLUOP_CHECK(mluOpGetDCNForwardWorkspaceSize( + handle_, dcn_desc, input_desc, offset_desc, mask_desc, weight_desc, + bias_desc, output_desc, &workspace_size)); + + if (workspace_size != 0) { + workspace = mlu_runtime_.allocate(workspace_size); + } + + eva_->setMluWorkspaceSize(workspace_size); + cpu_runtime_.deallocate(dcn_desc); +} + +void DcnForwardExecutor::workspaceFree() { + if (workspace != nullptr) { + mlu_runtime_.deallocate(workspace); + } +} + +void DcnForwardExecutor::compute() { + input_desc = tensor_desc_[0].tensor; + offset_desc = 
tensor_desc_[1].tensor; + mluOpDataType_t compute_type; + auto dcn_param = parser_->getProtoNode()->dcn_param(); + if (dcn_param.has_compute_type()) { + compute_type = cvtProtoDtypeToMluOp(dcn_param.compute_type()); + } else { + compute_type = MLUOP_DTYPE_FLOAT; + } + mluOpDataType_t compute_type2 = compute_type; + mluOpDCNDescriptor_t dcn_desc = cpu_runtime_.allocate( + mluOpCreateDCNDescriptor, mluOpDestroyDCNDescriptor); + MLUOP_CHECK(mluOpSetDCNDescriptor(dcn_desc, dimnb, pad, stride, dilation, dg, + g, im2col_step, compute_type2)); + input = data_vector_[0].device_ptr; + offset = data_vector_[1].device_ptr; + if (parser_->getInputNum() == 3) { + mask_desc = nullptr; + mask = nullptr; + weight_desc = tensor_desc_[2].tensor; + weight = data_vector_[2].device_ptr; + bias_desc = nullptr; + bias = nullptr; + output_desc = tensor_desc_[3].tensor; + output = data_vector_[3].device_ptr; + } else if (parser_->getInputNum() == 4) { + if (parser_->getProtoNode()->input(3).shape().dims_size() == 4) { + mask_desc = tensor_desc_[2].tensor; + mask = data_vector_[2].device_ptr; + weight_desc = tensor_desc_[3].tensor; + weight = data_vector_[3].device_ptr; + bias_desc = nullptr; + bias = nullptr; + output_desc = tensor_desc_[4].tensor; + output = data_vector_[4].device_ptr; + } else { + mask_desc = nullptr; + mask = nullptr; + weight_desc = tensor_desc_[2].tensor; + weight = data_vector_[2].device_ptr; + bias_desc = tensor_desc_[3].tensor; + bias = data_vector_[3].device_ptr; + output_desc = tensor_desc_[4].tensor; + output = data_vector_[4].device_ptr; + } + } else { + mask_desc = tensor_desc_[2].tensor; + mask = data_vector_[2].device_ptr; + weight_desc = tensor_desc_[3].tensor; + weight = data_vector_[3].device_ptr; + bias_desc = tensor_desc_[4].tensor; + bias = data_vector_[4].device_ptr; + output_desc = tensor_desc_[5].tensor; + output = data_vector_[5].device_ptr; + } + + input_desc->onchip_dtype = input_onchip_dtype; + weight_desc->onchip_dtype = weight_onchip_dtype; + VLOG(4) << "call mluOpDCNForward()"; + interface_timer_.start(); + + MLUOP_CHECK(mluOpDCNForward(handle_, dcn_desc, input_desc, input, offset_desc, + offset, mask_desc, mask, weight_desc, weight, + bias_desc, bias, workspace, workspace_size, + output_desc, output)); + + interface_timer_.stop(); + cpu_runtime_.deallocate(dcn_desc); +} + +static float bilinear(float *input_ptr, const int &ci_offset, const int &hi, + const int &wi, const int &ci, const float &h_in, + const float &w_in) { + int h_low = floor(h_in); + int w_low = floor(w_in); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_in - h_low; + float lw = w_in - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + float v1 = 0, v2 = 0, v3 = 0, v4 = 0; + + if (h_low >= 0 && w_low >= 0) { + v1 = input_ptr[(h_low * wi + w_low) * ci + ci_offset]; + } + + if (h_low >= 0 && w_high <= wi - 1) { + v2 = input_ptr[(h_low * wi + w_high) * ci + ci_offset]; + } + + if (h_high <= hi - 1 && w_low >= 0) { + v3 = input_ptr[(h_high * wi + w_low) * ci + ci_offset]; + } + + if (h_high <= hi - 1 && w_high <= wi - 1) { + v4 = input_ptr[(h_high * wi + w_high) * ci + ci_offset]; + } + + float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + float val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + return val; +} + +static void im2col(const int &N, const int &im2col_step, const int &dg, + const int &hi, const int &wi, const int &ci, const int &ho, + const int &wo, const int &co, const int &kh, const int &kw, + const int &pt, const int &pb, const int &pl, const int &pr, + 
const int &sh, const int &sw, const int &dh, const int &dw,
+                   const float *cpu_input, const float *cpu_offset,
+                   const float *cpu_mask, float *buffer) {
+  for (int idx_n = 0; idx_n < im2col_step; ++idx_n) {
+    for (int idx_ho = 0; idx_ho < ho; ++idx_ho) {
+      for (int idx_wo = 0; idx_wo < wo; ++idx_wo) {
+        float *input_ptr = (float *)cpu_input + idx_n * hi * wi * ci;
+        float *offset_ptr =
+            (float *)cpu_offset +
+            ((idx_n * ho + idx_ho) * wo + idx_wo) * dg * kh * kw * 2;
+        float *mask_ptr =
+            cpu_mask != nullptr
+                ? (float *)cpu_mask +
+                      ((idx_n * ho + idx_ho) * wo + idx_wo) * dg * kh * kw
+                : nullptr;
+        float *columns_ptr =
+            (float *)buffer +
+            ((idx_n * ho + idx_ho) * wo + idx_wo) * kh * kw * ci;
+        const int hi_start = idx_ho * sh - pt;
+        const int wi_start = idx_wo * sw - pl;
+        for (int idx_kh = 0; idx_kh < kh; ++idx_kh) {
+          for (int idx_kw = 0; idx_kw < kw; ++idx_kw) {
+            for (int idx_dg = 0; idx_dg < dg; ++idx_dg) {
+              const int data_offset_h =
+                  ((idx_dg * kh + idx_kh) * kw + idx_kw) * 2;
+              const int data_offset_w =
+                  ((idx_dg * kh + idx_kh) * kw + idx_kw) * 2 + 1;
+              const int data_mask = (idx_dg * kh + idx_kh) * kw + idx_kw;
+              const float offset_h = offset_ptr[data_offset_h];
+              const float offset_w = offset_ptr[data_offset_w];
+              const float mask =
+                  mask_ptr != nullptr ? mask_ptr[data_mask] : 1.0f;
+              const float h_in = hi_start + idx_kh * dh + offset_h;
+              const float w_in = wi_start + idx_kw * dw + offset_w;
+              if (h_in > -1 && w_in > -1 && h_in < hi && w_in < wi) {
+                for (int idx_ci = 0; idx_ci < ci / dg; ++idx_ci) {
+                  const int ci_offset = idx_dg * ci / dg + idx_ci;
+                  const int columns_offset =
+                      (idx_kh * kw + idx_kw) * ci + ci_offset;
+                  columns_ptr[columns_offset] =
+                      bilinear(input_ptr, ci_offset, hi, wi, ci, h_in, w_in) *
+                      mask;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void DcnForwardExecutor::transpose(float *input, float *output,
+                                   const int dims[], const int dim_num,
+                                   const int permute[]) {
+  int64_t dim_desc = dim_num;
+  std::vector<int> permute_desc;
+  if (dim_desc > 8 || dim_desc <= 0) {
+    LOG(ERROR) << "dim_desc is " << dim_desc
+               << ", it should be greater than 0 and no more than 8";
+  }
+  { std::vector<int>().swap(permute_desc); }
+  for (int i = 0; i < dim_num; i++) {
+    permute_desc.push_back(permute[i]);
+  }
+  mluOpTensorDescriptor_t input_desc, output_desc;
+  input_desc = cpu_runtime_.allocate(mluOpCreateTensorDescriptor,
+                                     mluOpDestroyTensorDescriptor);
+  output_desc = cpu_runtime_.allocate(mluOpCreateTensorDescriptor,
+                                      mluOpDestroyTensorDescriptor);
+  // sized to TRANSPOSE_MAX_DIM so every dim_num accepted above fits
+  int dims_trans[TRANSPOSE_MAX_DIM];
+  for (int i = 0; i < dim_num; ++i) {
+    dims_trans[i] = dims[permute[i]];
+  }
+
+  MLUOP_CHECK(mluOpSetTensorDescriptor(input_desc, MLUOP_LAYOUT_ARRAY,
+                                       MLUOP_DTYPE_FLOAT, dim_num, dims));
+  MLUOP_CHECK(mluOpSetTensorDescriptor(output_desc, MLUOP_LAYOUT_ARRAY,
+                                       MLUOP_DTYPE_FLOAT, dim_num, dims_trans));
+
+  MLUOP_CHECK(mluOpTransposeCpu(dim_desc, permute_desc, input_desc, input,
+                                output_desc, output));
+  cpu_runtime_.deallocate(input_desc);
+  cpu_runtime_.deallocate(output_desc);
+}
+
+static void BatchMatMul(const int &g, const int &m, const int &k, const int &n,
+                        float *input_a, float *input_b, float *output,
+                        const bool is_transa, const bool is_transb) {
+  const int batch_size = g;
+
+  assert(batch_size >= 1);
+#if USE_OPENBLAS
+  const CBLAS_ORDER Order = CblasRowMajor;
+  const CBLAS_TRANSPOSE TransA = is_transa ? CblasTrans : CblasNoTrans;
+  const CBLAS_TRANSPOSE TransB = is_transb ? CblasTrans : CblasNoTrans;
+
+  int lda = is_transa ? m : k;
+  int ldb = is_transb ?
k : n; + int ldc = n; + + float alpha = 1.0f; + float beta = 1.0f; +#else + auto matmul = [](float *lhs, float *rhs, float *output, bool is_trans_a, + bool is_trans_b, int M, int N, int K) { + for (int m = 0; m < M; m++) { + for (int n = 0; n < N; n++) { + // output[m * N + n] = 0.0f; + for (int k = 0; k < K; k++) { + int lhs_idx = m * K + k; + if (is_trans_a) lhs_idx = k * M + m; + int rhs_idx = k * N + n; + if (is_trans_b) rhs_idx = n * K + k; + output[m * N + n] += lhs[lhs_idx] * rhs[rhs_idx]; + } + } + } + }; +#endif + for (int i = 0; i < batch_size; ++i) { +#if USE_OPENBLAS + cblas_sgemm(Order, TransA, TransB, m, n, k, alpha, input_a + i * m * k, lda, + input_b + i * k * n, ldb, beta, output + i * m * n, ldc); +#else + matmul(input_a + i * m * k, input_b + i * k * n, output + i * m * n, + is_transa, is_transb, m, n, k); +#endif + } +} + +static void dealBias(float *cpu_output, float *cpu_bias, const int &N, + const int &ho, const int &wo, const int &co) { + for (int idx_n = 0; idx_n < N; ++idx_n) { + for (int idx_ho = 0; idx_ho < ho; ++idx_ho) { + for (int idx_wo = 0; idx_wo < wo; ++idx_wo) { + for (int idx_co = 0; idx_co < co; ++idx_co) { + cpu_output[((idx_n * ho + idx_ho) * wo + idx_wo) * co + idx_co] += + cpu_bias[idx_co]; + } + } + } + } +} + +void DcnForwardExecutor::computeDCNForwardCPU( + const int &dg, const int &g, const int &im2col_step, + const mluOpTensorDescriptor_t input_desc, const void *cpu_input, + const mluOpTensorDescriptor_t offset_desc, const void *cpu_offset, + const mluOpTensorDescriptor_t mask_desc, const void *cpu_mask, + const mluOpTensorDescriptor_t weight_desc, const void *cpu_weight, + const mluOpTensorDescriptor_t bias_desc, const void *cpu_bias, + const mluOpTensorDescriptor_t output_desc, const void *cpu_output, + float *buffer, int pad[], int stride[], int dilation[], + int64_t &theory_ops) { + const int N = input_desc->dims[0]; + const int hi = input_desc->dims[1]; + const int wi = input_desc->dims[2]; + const int ci = input_desc->dims[3]; + const int ho = offset_desc->dims[1]; + const int wo = offset_desc->dims[2]; + const int co = output_desc->dims[3]; + const int kh = weight_desc->dims[1]; + const int kw = weight_desc->dims[2]; + const int pt = pad[0]; + const int pb = pad[1]; + const int pl = pad[2]; + const int pr = pad[3]; + const int sh = stride[0]; + const int sw = stride[1]; + const int dh = dilation[0]; + const int dw = dilation[1]; + int coeff = getCoefficientOfLT2CT(); + if (g == 1) { + for (int i = 0; i < N / im2col_step; ++i) { + float *input_i = (float *)cpu_input + i * im2col_step * hi * wi * ci; + float *offset_i = + (float *)cpu_offset + i * im2col_step * ho * wo * dg * kh * kw * 2; + float *mask_i = + cpu_mask != nullptr + ? 
(float *)cpu_mask + i * im2col_step * ho * wo * dg * kh * kw + : nullptr; + float *output_i = (float *)cpu_output + i * im2col_step * ho * wo * co; + // 1.im2col + memset(buffer, 0, (im2col_step * ho * wo * kh * kw * ci) * sizeof(float)); + im2col(N, im2col_step, dg, hi, wi, ci, ho, wo, co, kh, kw, pt, pb, pl, pr, + sh, sw, dh, dw, input_i, offset_i, mask_i, buffer); + theory_ops += (int64_t)im2col_step * ho * wo * kh * kw * ci * + 15; // bilinear_count + mask + + // 2.BMM + float *input_a = buffer; + float *input_b = (float *)cpu_weight; + const int k = kh * kw * ci / g; + const int m = im2col_step * ho * wo; + const int n = co / g; + memset(output_i, 0, (im2col_step * ho * wo * co) * sizeof(float)); + BatchMatMul(g, m, k, n, input_a, input_b, (float *)output_i, false, true); + theory_ops += 2 * (int64_t)g * m * k * n / coeff; + } + } else { + // buffer:| columns_a | columns_b | output | + float *buffer_columns_a = buffer; + float *buffer_columns_b = + buffer_columns_a + im2col_step * ho * wo * kh * kw * ci; + float *buffer_output = + buffer_columns_b + im2col_step * ho * wo * kh * kw * ci; + + for (int i = 0; i < N / im2col_step; ++i) { + float *input_i = (float *)cpu_input + i * im2col_step * hi * wi * ci; + float *offset_i = + (float *)cpu_offset + i * im2col_step * ho * wo * dg * kh * kw * 2; + float *mask_i = + cpu_mask != nullptr + ? (float *)cpu_mask + i * im2col_step * ho * wo * dg * kh * kw + : nullptr; + float *output_i = (float *)cpu_output + i * im2col_step * ho * wo * co; + // 1.im2col + memset(buffer_columns_a, 0, + (im2col_step * ho * wo * kh * kw * ci) * sizeof(float)); + im2col(N, im2col_step, dg, hi, wi, ci, ho, wo, co, kh, kw, pt, pb, pl, pr, + sh, sw, dh, dw, input_i, offset_i, mask_i, buffer_columns_a); + theory_ops += (int64_t)im2col_step * ho * wo * kh * kw * ci * + 15; // bilinear_count + mask + + // 2.split columns + // [im2col_step*ho*wo*kh*kw,ci]->[g,im2col_step*ho*wo*kh*kw,ci/g] + int dims_1[3] = {im2col_step * ho * wo * kh * kw, g, ci / g}; + int permute_1[3] = {1, 0, 2}; + transpose(buffer_columns_a, buffer_columns_b, dims_1, 3, permute_1); + theory_ops += (int64_t)im2col_step * ho * wo * kh * kw * ci; + + // 3.BMM + float *input_a = buffer_columns_b; + float *input_b = (float *)cpu_weight; + const int k = kh * kw * ci / g; + const int m = im2col_step * ho * wo; + const int n = co / g; + memset(buffer_output, 0, (im2col_step * ho * wo * co) * sizeof(float)); + BatchMatMul(g, m, k, n, input_a, input_b, buffer_output, false, true); + theory_ops += 2 * (int64_t)g * m * k * n / coeff; + + // 4.transpose output [g,im2col_step*ho*wo, co/g]->[im2col_step*ho*wo, g, + // co/g] + int dims_2[3] = {g, im2col_step * ho * wo, co / g}; + int permute_2[3] = {1, 0, 2}; + transpose(buffer_output, (float *)output_i, dims_2, 3, permute_2); + theory_ops += (int64_t)im2col_step * ho * wo * co; + } + } + + if (cpu_bias) { + dealBias((float *)cpu_output, (float *)cpu_bias, N, ho, wo, co); + theory_ops += (int64_t)N * ho * wo * co; + } +} + +void DcnForwardExecutor::cpuCompute() { + input_desc = tensor_desc_[0].tensor; + offset_desc = tensor_desc_[1].tensor; + cpu_input = cpu_fp32_input_[0]; + cpu_offset = cpu_fp32_input_[1]; + if (parser_->getInputNum() == 3) { + mask_desc = nullptr; + cpu_mask = nullptr; + weight_desc = tensor_desc_[2].tensor; + cpu_weight = cpu_fp32_input_[2]; + bias_desc = nullptr; + cpu_bias = nullptr; + output_desc = tensor_desc_[3].tensor; + cpu_output = cpu_fp32_output_[0]; + } else if (parser_->getInputNum() == 4) { + if 
(parser_->getProtoNode()->input(3).shape().dims_size() == 4) {
+      mask_desc = tensor_desc_[2].tensor;
+      cpu_mask = cpu_fp32_input_[2];
+      weight_desc = tensor_desc_[3].tensor;
+      cpu_weight = cpu_fp32_input_[3];
+      bias_desc = nullptr;
+      cpu_bias = nullptr;
+      output_desc = tensor_desc_[4].tensor;
+      cpu_output = cpu_fp32_output_[0];
+    } else {
+      mask_desc = nullptr;
+      cpu_mask = nullptr;
+      weight_desc = tensor_desc_[2].tensor;
+      cpu_weight = cpu_fp32_input_[2];
+      bias_desc = tensor_desc_[3].tensor;
+      cpu_bias = cpu_fp32_input_[3];
+      output_desc = tensor_desc_[4].tensor;
+      cpu_output = cpu_fp32_output_[0];
+    }
+  } else {
+    mask_desc = tensor_desc_[2].tensor;
+    cpu_mask = cpu_fp32_input_[2];
+    weight_desc = tensor_desc_[3].tensor;
+    cpu_weight = cpu_fp32_input_[3];
+    bias_desc = tensor_desc_[4].tensor;
+    cpu_bias = cpu_fp32_input_[4];
+    output_desc = tensor_desc_[5].tensor;
+    cpu_output = cpu_fp32_output_[0];
+  }
+
+  const int ho = offset_desc->dims[1];
+  const int wo = offset_desc->dims[2];
+  const int kh = weight_desc->dims[1];
+  const int kw = weight_desc->dims[2];
+  const int ci = input_desc->dims[3];
+  const int co = output_desc->dims[3];
+
+  size_t cpu_buffer_size = 0;
+  if (g == 1) {
+    cpu_buffer_size =
+        (static_cast<size_t>(im2col_step) * ho * wo * kh * kw * ci) *
+        sizeof(float);
+  } else {
+    cpu_buffer_size = (2lu * im2col_step * ho * wo * kh * kw * ci +
+                       im2col_step * ho * wo * co) *
+                      sizeof(float);
+  }
+
+  float *buffer = nullptr;
+  buffer = (float *)cpu_runtime_.allocate(cpu_buffer_size);
+  if (buffer == nullptr) {
+    LOG(ERROR) << "dcn_forward: allocate buffer failed.";
+  }
+  theory_ops = 0;
+  computeDCNForwardCPU(dg, g, im2col_step, input_desc, cpu_input, offset_desc,
+                       cpu_offset, mask_desc, cpu_mask, weight_desc, cpu_weight,
+                       bias_desc, cpu_bias, output_desc, cpu_output, buffer,
+                       pad, stride, dilation, theory_ops);
+
+  cpu_runtime_.deallocate(buffer);
+}
+
+int64_t DcnForwardExecutor::getTheoryOps() {
+  if (exe_config_->mlu_only) {
+    theory_ops = 0;
+
+    input_desc = tensor_desc_[0].tensor;
+    offset_desc = tensor_desc_[1].tensor;
+    if (parser_->getInputNum() == 3) {
+      weight_desc = tensor_desc_[2].tensor;
+      bias_desc = nullptr;
+      output_desc = tensor_desc_[3].tensor;
+    } else if (parser_->getInputNum() == 4) {
+      if (parser_->getProtoNode()->input(3).shape().dims_size() == 4) {
+        weight_desc = tensor_desc_[3].tensor;
+        bias_desc = nullptr;
+        output_desc = tensor_desc_[4].tensor;
+      } else {
+        weight_desc = tensor_desc_[2].tensor;
+        bias_desc = tensor_desc_[3].tensor;
+        output_desc = tensor_desc_[4].tensor;
+      }
+    } else {
+      weight_desc = tensor_desc_[3].tensor;
+      bias_desc = tensor_desc_[4].tensor;
+      output_desc = tensor_desc_[5].tensor;
+    }
+
+    const int N = input_desc->dims[0];
+    const int hi = input_desc->dims[1];
+    const int wi = input_desc->dims[2];
+    const int ci = input_desc->dims[3];
+    const int ho = offset_desc->dims[1];
+    const int wo = offset_desc->dims[2];
+    const int co = output_desc->dims[3];
+    const int kh = weight_desc->dims[1];
+    const int kw = weight_desc->dims[2];
+    int coeff = getCoefficientOfLT2CT();
+    const int k = kh * kw * ci / g;
+    const int m = im2col_step * ho * wo;
+    const int n = co / g;
+    if (g == 1) {
+      for (int i = 0; i < N / im2col_step; ++i) {
+        // 1.im2col
+        // bilinear_count + mask
+        theory_ops +=
+            (int64_t)im2col_step * ho * wo * kh * kw * (dg * 7 + ci * 7);
+        // 2.BMM
+        theory_ops += 2 * (int64_t)g * m * k * n / coeff;
+      }
+    } else {
+      for (int i = 0; i < N / im2col_step; ++i) {
+        // 1.im2col
+        // bilinear_count + mask
+        theory_ops +=
+            (int64_t)im2col_step * ho * wo * kh * kw * (dg * 7 + ci * 7);
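+        // Note: steps 2-4 below mirror the grouped path in
+        // computeDCNForwardCPU: reordering the im2col columns into
+        // [g, im2col_step*ho*wo*kh*kw, ci/g], the grouped batch matmul
+        // against the [g, co/g, kh*kw*ci/g] weights, and transposing the
+        // grouped output back to channel-last order each contribute to the
+        // theoretical op count.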
+        // 2.split columns
+        // [im2col_step*ho*wo*kh*kw,ci]->[g,im2col_step*ho*wo*kh*kw,ci/g]
+        theory_ops += (int64_t)im2col_step * ho * wo * kh * kw * ci;
+        // 3.BMM
+        theory_ops += 2 * (int64_t)g * m * k * n / coeff;
+        // 4.transpose output [g,im2col_step*ho*wo, co/g]->[im2col_step*ho*wo,
+        // g, co/g]
+        theory_ops += (int64_t)im2col_step * ho * wo * co;
+      }
+    }
+
+    if (bias_desc) {
+      theory_ops += (int64_t)N * ho * wo * co;
+    }
+  }
+  VLOG(4) << "getTheoryOps: " << theory_ops << " ops";
+  return theory_ops;
+}
+
+}  // namespace mluoptest
diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_forward/dcn_forward.h b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_forward/dcn_forward.h
new file mode 100755
index 000000000..ad0dda9e0
--- /dev/null
+++ b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_forward/dcn_forward.h
@@ -0,0 +1,84 @@
+/*************************************************************************
+ * Copyright (C) [2019-2022] by Cambricon, Inc.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/
+#ifndef TEST_MLUOP_GTEST_SRC_ZOO_DCN_FORWARD_DCN_FORWARD_H_
+#define TEST_MLUOP_GTEST_SRC_ZOO_DCN_FORWARD_DCN_FORWARD_H_
+
+#include <vector>
+#include "executor.h"
+
+namespace mluoptest {
+
+class DcnForwardExecutor : public Executor {
+ public:
+  DcnForwardExecutor() {}
+  ~DcnForwardExecutor() {}
+
+  void workspaceMalloc();
+  void workspaceFree();
+  void paramCheck();
+  void compute();
+  void cpuCompute();
+  int64_t getTheoryOps() override;
+
+ private:
+  int getCoefficientOfLT2CT();
+  void transpose(float *input, float *output, const int dims[],
+                 const int dim_num, const int permute[]);
+  void computeDCNForwardCPU(
+      const int &dg, const int &g, const int &im2col_step,
+      const mluOpTensorDescriptor_t input_desc, const void *cpu_input,
+      const mluOpTensorDescriptor_t offset_desc, const void *cpu_offset,
+      const mluOpTensorDescriptor_t mask_desc, const void *cpu_mask,
+      const mluOpTensorDescriptor_t weight_desc, const void *cpu_weight,
+      const mluOpTensorDescriptor_t bias_desc, const void *cpu_bias,
+      const mluOpTensorDescriptor_t output_desc, const void *cpu_output,
+      float *buffer, int pad[], int stride[], int dilation[],
+      int64_t &theory_ops);
+  mluOpDataType_t input_onchip_dtype;
+  mluOpDataType_t weight_onchip_dtype;
+
+  mluOpTensorDescriptor_t input_desc;
+  mluOpTensorDescriptor_t offset_desc;
+  mluOpTensorDescriptor_t mask_desc = nullptr;  // optional
+  mluOpTensorDescriptor_t output_desc;
+  mluOpTensorDescriptor_t weight_desc;
+  mluOpTensorDescriptor_t bias_desc = nullptr;  // optional
+
+  int dimnb;
+  int pad[4];
+  int stride[2];
+  int dilation[2];
+  int dg;
+  int g;
+  int im2col_step;
+
+  void *input = nullptr;
+  void *offset = nullptr;
+  void *mask = nullptr;
+  void *output = nullptr;
+  void *weight = nullptr;
+  void *bias = nullptr;
+
+  void *cpu_input = nullptr;
+  void *cpu_offset = nullptr;
+  void *cpu_mask = nullptr;
+  void *cpu_output = nullptr;
+  void *cpu_weight = nullptr;
+  void *cpu_bias = nullptr;
+
+  void *workspace =
nullptr; + size_t workspace_size = 0; + int64_t theory_ops = 0; +}; + +} // namespace mluoptest +#endif // TEST_MLUOP_GTEST_SRC_ZOO_DCN_FORWARD_DCN_FORWARD_H_ diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_forward/test_case/case_hi_16.prototxt b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_forward/test_case/case_hi_16.prototxt new file mode 100755 index 000000000..12765af0f --- /dev/null +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_forward/test_case/case_hi_16.prototxt @@ -0,0 +1,117 @@ +op_name: "dcn_forward" +op_type: DCN_FORWARD +input { + id: "input" + shape: { + dims: 1 + dims: 16 + dims: 16 + dims: 300 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT + random_data: { + seed: 23 + upper_bound: 1 + lower_bound: -1 + distribution: UNIFORM + } +} +input { + id: "offset" + shape: { + dims: 1 + dims: 16 + dims: 16 + dims: 36 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT + random_data: { + seed: 23 + upper_bound: 1 + lower_bound: -1 + distribution: UNIFORM + } +} +input { + id: "mask" + shape: { + dims: 1 + dims: 16 + dims: 16 + dims: 18 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT + random_data: { + seed: 23 + upper_bound: 1 + lower_bound: 0 + distribution: UNIFORM + } +} +input { + id: "weight" + shape: { + dims: 300 + dims: 3 + dims: 3 + dims: 100 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT + random_data: { + seed: 23 + upper_bound: 10 + lower_bound: -1 + distribution: UNIFORM + } +} +input { + id: "bias" + shape: { + dims: 300 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 23 + upper_bound: 5 + lower_bound: -6 + distribution: UNIFORM + } +} +output { + id: "output" + shape: { + dims: 1 + dims: 16 + dims: 16 + dims: 300 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT +} +dcn_param: { + dimnb: 4 + pad: 1 + pad: 1 + pad: 1 + pad: 1 + stride: 1 + stride: 1 + dilation: 1 + dilation: 1 + deformable_group: 2 + conv_group: 3 + im2col_step: 1 + compute_type: 2 +} +test_param: { + error_func: DIFF1 + error_func: DIFF2 + error_threshold: 0.003 + error_threshold: 0.003 + baseline_device: CPU +}