From 658fe6e5a1db5cc69468b5a449ec76ddfbe9e962 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 3 Dec 2021 13:57:28 +0000 Subject: [PATCH 01/22] add reduce operation --- include/mxnet/c_api.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 4210a9fa63d4..cfefcee1ddee 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -515,6 +515,15 @@ MXNET_DLL int MXSetNumOMPThreads(int thread_num); * \param bulk_size new bulk_size * \param prev_bulk_size previous bulk_size */ + + /*! + * \brief Get the compute capability of a given GPU. + * \param dev the GPU number to query + * \param out pointer to integer that will hold the compute capability of the queried GPU. + * \return 0 when success, -1 when failure happens. + */ +MXNET_DLL int MXGetGPUSMArch(int dev, int* out); + MXNET_DLL int MXEngineSetBulkSize(int bulk_size, int* prev_bulk_size); /*! @@ -534,6 +543,22 @@ MXNET_DLL int MXGetGPUCount(int* out); */ MXNET_DLL int MXGetGPUMemoryInformation(int dev, int* free_mem, int* total_mem); +/*! + * \brief Get the size of the NCCL unique id (in bytes). + * \param size pointer to integer that will hold the NCCL unique id size. + * \return 0 when success, -1 when failure happens. + */ +MXNET_DLL int MXNCCLGetUniqueIdSize(int* size); + +/*! + * \brief Get the NCCL unique id. + * \param out pointer to an array that will hold the NCCL unique id. It has to be at least of the + * size returned by MXNCCLGetUniqueIdSize. + * \return 0 when success, -1 when failure happens. + */ +MXNET_DLL int MXNCCLGetUniqueId(void* out); + + /*! * \brief get the free and total available memory on a GPU * \param dev the GPU number to query From eeff638019a801af8e64a74b2ec6930cff11d2eb Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 3 Dec 2021 13:57:42 +0000 Subject: [PATCH 02/22] add reduce operation --- src/c_api/c_api.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 2fb883f00997..860c9dd4c420 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -62,6 +62,13 @@ #include "miniz.h" #include "nnvm/pass_functions.h" +#if MXNET_USE_CUDA +#include "../common/cuda/utils.h" +#endif +#if MXNET_USE_NCCL +#include +#endif + // FTZ only applies to SSE and AVX instructions. 
#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || \ (defined(_M_IX86_FP) && _M_IX86_FP >= 1) From 984d9e1584c0ab30205d6a46ad278b764c0406f1 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 3 Dec 2021 13:58:37 +0000 Subject: [PATCH 03/22] add reduce operation --- src/operator/contrib/nn/reduce_op-inl.h | 97 +++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 src/operator/contrib/nn/reduce_op-inl.h diff --git a/src/operator/contrib/nn/reduce_op-inl.h b/src/operator/contrib/nn/reduce_op-inl.h new file mode 100644 index 000000000000..771a4fa6ecce --- /dev/null +++ b/src/operator/contrib/nn/reduce_op-inl.h @@ -0,0 +1,97 @@ +#ifndef MXNET_OPERATOR_CONTRIB_NCCLREDUCE_H_ +#define MXNET_OPERATOR_CONTRIB_NCCLREDUCE_H_ + +#include +#include "../../mshadow_op.h" +#include "../../mxnet_op.h" +#include "../../operator_common.h" +#include "../../elemwise_op_common.h" +#include "../../tensor/init_op.h" + +#if MXNET_USE_NCCL +#include +#include +#include + +namespace mxnet { +namespace op { + +struct NCCLReduceParam : public dmlc::Parameter { + int32_t num_gpus; + int32_t root_rank; + int32_t rank; + uintptr_t nccl_unique_id; + + DMLC_DECLARE_PARAMETER(NCCLReduceParam) { + DMLC_DECLARE_FIELD(num_gpus) + .set_default(1) + .describe("Number of all gpus."); + DMLC_DECLARE_FIELD(root_rank) + .set_default(0) + .describe("root rank of reduce operation"); + DMLC_DECLARE_FIELD(rank) + .set_default(0) + .describe("rank of current process"); + DMLC_DECLARE_FIELD(nccl_unique_id) + .describe("NCCL unique ID"); + } +}; + +template +struct ncclreduce_backward { + template + MSHADOW_XINLINE static void Map(int i, + DType* in_grad, + const DType* out_grad) { + KERNEL_ASSIGN(in_grad[i], req, out_grad[i] * 1); + } +}; + +template +void NCCLReduceBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + mshadow::Stream* s = ctx.get_stream(); + const TBlob& out_grad = inputs[0]; + const TBlob& in_grad = outputs[0]; + const NCCLReduceParam& param = nnvm::get(attrs.parsed); + using namespace mxnet_op; + MSHADOW_TYPE_SWITCH(out_grad.type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { + Kernel, xpu>::Launch(s, + in_grad.Size(), + in_grad.dptr(), + out_grad.dptr()); + }); + }); +} + + + + + +class NCCLCommContainer { + public: + struct Param { + int num_gpus; + int rank; + uintptr_t nccl_unique_id; + }; + static inline std::unordered_map> comm_map; + + static void Init(const Param& param); +}; + +} // namespace op +} // namespace mxnet + +#else +static_assert(false, "You need to compile with NCCL support to use reduce operation!"); +#endif // MXNET_USE_NCCL + +#endif // MXNET_OPERATOR_CONTRIB_SPATIAL_PARALLEL_SUPPORT_H_ \ No newline at end of file From 5ee19827d74949881ef4f44380acc89b65ada256 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 3 Dec 2021 13:58:56 +0000 Subject: [PATCH 04/22] add reduce operation --- src/operator/contrib/nn/reduce_op.cc | 89 ++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 src/operator/contrib/nn/reduce_op.cc diff --git a/src/operator/contrib/nn/reduce_op.cc b/src/operator/contrib/nn/reduce_op.cc new file mode 100644 index 000000000000..b21c290a505e --- /dev/null +++ b/src/operator/contrib/nn/reduce_op.cc @@ -0,0 +1,89 @@ +#include "reduce_op-inl.h" +#include +#include +#include +#include +#include +#include +#include 
"../../operator_common.h" +#include "../../elemwise_op_common.h" + + +namespace mxnet { +namespace op { + +void NCCLCommContainer::Init(const NCCLCommContainer::Param& param) { + std::lock_guard l(Storage::Get()->GetMutex(Context::kGPU)); + if (NCCLCommContainer::comm_map.count(param.num_gpus) == 0) { + auto [it, inserted] = NCCLCommContainer::comm_map.emplace(param.num_gpus, // NOLINT(*) + std::make_unique()); + CHECK(inserted) << "Could not insert new NCCL communicator!"; + ncclComm_t* comm = it->second.get(); + ncclUniqueId id = *(reinterpret_cast( + reinterpret_cast(param.nccl_unique_id))); + auto result = ncclCommInitRank(comm, param.num_gpus, id, param.rank); + CHECK_EQ(result, ncclSuccess) << "ncclCommInitRank failed!"; + } +} + +bool NCCLReduceShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector* in_attrs, + mxnet::ShapeVector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + + SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); + SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); + return out_attrs->at(0).ndim() != 0U && out_attrs->at(0).Size() != 0U; +} + +inline bool NCCLReduceType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + + TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); + TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); + return out_attrs->at(0) != -1; +} + +DMLC_REGISTER_PARAMETER(NCCLReduceParam); + +NNVM_REGISTER_OP(_contrib_NCCLReduce) +.describe(R"code(Reduce operation +)code" ADD_FILELINE) +.set_attr_parser(ParamParser) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data"}; + }) +.set_attr("FInferShape", NCCLReduceShape) +.set_attr("FInferType", NCCLReduceType) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_NCCLReduce"}) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs) { + return std::vector >{{0, 0}}; + }) +.set_attr("FInplaceIdentity", + [](const NodeAttrs& attrs){ + const NCCLReduceParam& param = nnvm::get(attrs.parsed); + if (param.num_gpus == 1) { + return std::vector{true}; + } else { + return std::vector{false}; + } + }) +.add_argument("data", "NDArray-or-Symbol", "Input ndarray") +.add_arguments(NCCLReduceParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_NCCLReduce) +.set_attr_parser(ParamParser) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("TIsBackward", true) +.set_attr("FCompute", NCCLReduceBackward); +} +} \ No newline at end of file From ed3687e8a3dd33290724187eea456cf507dae896 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 3 Dec 2021 13:59:12 +0000 Subject: [PATCH 05/22] add reduce operation --- src/operator/contrib/nn/reduce_op.cu | 60 ++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 src/operator/contrib/nn/reduce_op.cu diff --git a/src/operator/contrib/nn/reduce_op.cu b/src/operator/contrib/nn/reduce_op.cu new file mode 100644 index 000000000000..6c0a1a92f2fa --- /dev/null +++ b/src/operator/contrib/nn/reduce_op.cu @@ -0,0 +1,60 @@ +/*! 
+ * Copyright (c) 2020 by Contributors + * \file spatial_parallel_support.cu + * \brief Support operators for spatial parallelism + * \author Przemyslaw Tredak +*/ + +#include "reduce_op-inl.h" +#include +#include +#include +#include +#include "../../operator_common.h" +#include "../../../common/utils.h" +#include "../../tensor/elemwise_binary_op.h" + +namespace mxnet { +namespace op { + + +void NCCLReduceCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const NCCLReduceParam& param = nnvm::get(attrs.parsed); + if (req[0] == OpReqType::kNullOp) return; + if (param.num_gpus == 1 && req[0] == OpReqType::kWriteInplace) return; + NCCLCommContainer::Param p = {param.num_gpus, + param.rank, + param.nccl_unique_id}; + NCCLCommContainer::Init(p); + + std::lock_guard l(Storage::Get()->GetMutex(Context::kGPU)); + ncclComm_t comm = *(NCCLCommContainer::comm_map.at(param.num_gpus)); + const index_t size = inputs[0].shape_.Size() * + common::mshadow_type_info(inputs[0].type_flag_).size; + if (req[0] != OpReqType::kAddTo) { + + ncclResult_t result = ncclReduce(inputs[0].dptr_, + outputs[0].dptr_, + size, ncclInt8, ncclSum, param.root_rank, + comm, + mshadow::Stream::GetStream(ctx.get_stream())); + + + CHECK_EQ(result, ncclSuccess) << "NCCL Reduce failed!"; + } else { + LOG(FATAL) << "kAddTo not supported yet!"; + } +} + +NNVM_REGISTER_OP(_contrib_NCCLReduce) +.set_attr("FCompute", NCCLReduceCompute); + +NNVM_REGISTER_OP(_backward_NCCLReduce) +.set_attr("FCompute", NCCLReduceBackward); + +} // namespace op +} // namespace mxnet \ No newline at end of file From eb26c0edb6ac8f693e10e0b351ba1ae1c82a51d5 Mon Sep 17 00:00:00 2001 From: "xinyual@amazon.com" Date: Tue, 14 Dec 2021 10:13:46 +0000 Subject: [PATCH 06/22] pass way --- example/extensions/lib_pass/pass_lib.cc | 54 ++++++++++++++++++++++++ example/extensions/lib_pass/test_pass.py | 37 ++++++++++++++++ 2 files changed, 91 insertions(+) diff --git a/example/extensions/lib_pass/pass_lib.cc b/example/extensions/lib_pass/pass_lib.cc index f441877fcad7..31b362addb8d 100644 --- a/example/extensions/lib_pass/pass_lib.cc +++ b/example/extensions/lib_pass/pass_lib.cc @@ -25,6 +25,7 @@ #include #include #include +#include #include "mxnet/lib_api.h" using namespace mxnet::ext; @@ -35,7 +36,60 @@ MXReturnValue myPass(mxnet::ext::Graph* g, for (auto kv : options) { std::cout << "option: " << kv.first << " ==> " << kv.second << std::endl; } + //g->addNode("myConv","Convolution"); + mxnet::ext::Node *copy_old_layer = g->getNode(g->size()-2); + mxnet::ext::Node *old_layer = g->getNode(g->size()-1); + std::cout<name<addNode("myweight3","null"); + new_weight->alloc_arg({3,2}, MXContext::CPU(0),kFloat32); + + mxnet::ext::Node *new_bias = g->addNode("mybias3","null"); + new_bias->alloc_arg({3,}, MXContext::CPU(0),kFloat32); + + + + + //new_bias->alloc_aux({2}, MXContext::CPU(0), kInt32); + + + auto attrs = copy_old_layer->attrs; + mxnet::ext::Node *new_layer = g->addNode("mylinaer","FullyConnected"); + auto new_attrs = &new_layer->attrs; + for (auto it = attrs.begin(); it!=attrs.end(); it ++ ) + { + + std::cout<first<<" : "<second<first!="num_hidden"){ + new_attrs->insert({{it->first,it->second}});} + } + new_attrs->insert({{"num_hidden","3"}}); + + std::cout<<"current:"<begin(); it!=new_attrs->end(); it ++ ) + { + std::cout<first<<" : "<second<inputs.push_back({old_layer, 0}); + new_layer->inputs.push_back({new_weight, 0}); + new_layer->inputs.push_back({new_bias, 0}); + 
old_layer->outputs.push_back({new_layer, 0}); + new_weight->outputs.push_back({new_layer, 1}); + new_bias->outputs.push_back({new_layer, 2}); + auto it = g->outputs.begin(); + g->outputs.erase(it); + mxnet::ext::NodeEntry new_entry; + new_entry.node = new_layer; + new_entry.entry = 0; + g->outputs.push_back(new_entry); + g->print(); + + return MX_SUCCESS; } diff --git a/example/extensions/lib_pass/test_pass.py b/example/extensions/lib_pass/test_pass.py index ab89f9566ebe..7e65eb60c75c 100644 --- a/example/extensions/lib_pass/test_pass.py +++ b/example/extensions/lib_pass/test_pass.py @@ -27,6 +27,8 @@ import mxnet as mx from mxnet.gluon import nn from mxnet import nd +from mxnet import np, npx +from gluonnlp.layers import get_activation from mxnet.base import _LIB, check_call, mx_uint, c_str, c_str_array, SymbolHandle # load library @@ -47,7 +49,42 @@ d = mx.sym.exp(c) sym = mx.sym.log(d) + +class Easynet(nn.HybridBlock): + def __init__(self): + super().__init__() + self.l1 = nn.Dense(in_units=2, units=2, flatten=False) + #self.l2 = nn.Dense(in_units=2, units=2, flatten=False) + self.act1 = get_activation('relu') + + #self.seq.add(nn.Dense(in_units=2, units=2, flatten=False)) + #self.seq.add(get_activation('relu')) + #self.seq.add(nn.Dense(in_units=2, units=2, flatten=False)) + #self.seq.register_op_hook(mon_callback, monitor_all=True) + #self.l1.register_op_hook(mon_callback, monitor_all=True) + + + def forward(self, input): + input = self.l1(input) + input = self.act1(input) + return input + + def test_model(pass_name): + model = Easynet() + model.initialize() + model.hybridize() + + + print('try on model') + x = np.array([[1,2]]) + + model.optimize_for(x, backend = pass_name) + + out = model(x) + model.export("my_model") + print(out.shape) + return args={'a':mx.nd.ones((3,2)), 'b':mx.nd.ones((3,2))} # execute in MXNet print('-------------------------------') From 61906ea0ed4056b788c87af2c2ebf2598580a481 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 16 Dec 2021 07:12:10 +0000 Subject: [PATCH 07/22] add nccl reduce inside mxnet --- include/mxnet/c_api.h | 8 ++++++ include/mxnet/resource.h | 14 +++++++--- python/mxnet/gluon/contrib/__init__.py | 2 ++ src/c_api/c_api.cc | 34 +++++++++++++++++++++++ src/imperative/attach_op_resource_pass.cc | 6 ++++ src/operator/contrib/nn/reduce_op.cc | 2 +- src/operator/contrib/nn/reduce_op.cu | 3 +- 7 files changed, 62 insertions(+), 7 deletions(-) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index cfefcee1ddee..4120cecf36ab 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -533,6 +533,14 @@ MXNET_DLL int MXEngineSetBulkSize(int bulk_size, int* prev_bulk_size); */ MXNET_DLL int MXGetGPUCount(int* out); +/*! + * \brief Get the compute capability of a given GPU. + * \param dev the GPU number to query + * \param out pointer to integer that will hold the compute capability of the queried GPU. + * \return 0 when success, -1 when failure happens. + */ +MXNET_DLL int MXGetGPUSMArch(int dev, int* out); + /*! * \brief get the free and total available memory on a GPU * Note: Deprecated, use MXGetGPUMemoryInformation64 instead. diff --git a/include/mxnet/resource.h b/include/mxnet/resource.h index b856002cb76f..038d7c20fcb2 100644 --- a/include/mxnet/resource.h +++ b/include/mxnet/resource.h @@ -39,16 +39,22 @@ struct ResourceRequest { /*! \brief Resource type, indicating what the pointer type is */ enum Type { /*! \brief mshadow::Random object */ - kRandom, + kRandom = 0, /*! 
\brief A dynamic temp space that can be arbitrary size */ - kTempSpace, + kTempSpace = 1, /*! \brief common::RandGenerator object, which can be used in GPU kernel functions */ - kParallelRandom + kParallelRandom = 2 #if MXNET_USE_CUDNN == 1 , /*! \brief cudnnDropoutDescriptor_t object for GPU dropout kernel functions */ - kCuDNNDropoutDesc + kCuDNNDropoutDesc = 3 #endif // MXNET_USE_CUDNN == 1 +#if MXNET_USE_CUDA + , + /*! \brief Resource indicating the usage of multi GPU communication, used to prevent + * multiple ops of doing it at the same time */ + kMultiGPUComm = 4 +#endif // MXNET_USE_CUDA == 1 }; /*! \brief type of resources */ Type type; diff --git a/python/mxnet/gluon/contrib/__init__.py b/python/mxnet/gluon/contrib/__init__.py index 9b9f0ce8d696..8d421a137336 100644 --- a/python/mxnet/gluon/contrib/__init__.py +++ b/python/mxnet/gluon/contrib/__init__.py @@ -21,3 +21,5 @@ from . import data from . import estimator + +from . import nn \ No newline at end of file diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 860c9dd4c420..ea8cec633171 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -1943,6 +1943,16 @@ int MXGetGPUCount(int* out) { API_END(); } +int MXGetGPUSMArch(int dev, int* out) { + API_BEGIN(); +#if MXNET_USE_CUDA == 1 + *out = SMArch(dev); +#else + LOG(FATAL) << "Compile with USE_CUDA=1 to query CUDA device properties."; +#endif + API_END(); +} + // Deprecated: use MXGetGPUMemoryInformation64() instead. int MXGetGPUMemoryInformation(int dev, int* free_mem, int* total_mem) { API_BEGIN(); @@ -1960,6 +1970,30 @@ int MXGetGPUMemoryInformation64(int dev, uint64_t* free_mem, uint64_t* total_mem API_END(); } +int MXNCCLGetUniqueIdSize(int* size) { + API_BEGIN(); +#if MXNET_USE_CUDA && MXNET_USE_NCCL + *size = sizeof(ncclUniqueId); +#else + LOG(FATAL) << "Compile with USE_CUDA=1 and USE_NCCL=1 to have NCCL support."; +#endif + API_END(); +} + +int MXNCCLGetUniqueId(void* out) { + API_BEGIN(); +#if MXNET_USE_CUDA && MXNET_USE_NCCL + auto ret = ncclGetUniqueId(reinterpret_cast(out)); + if (ret != ncclSuccess) { + LOG(FATAL) << "Failed to get the NCCL unique id"; + } +#else + LOG(FATAL) << "Compile with USE_CUDA=1 and USE_NCCL=1 to have NCCL support."; +#endif + API_END(); +} + + int MXGetVersion(int* out) { API_BEGIN(); *out = static_cast(MXNET_VERSION); diff --git a/src/imperative/attach_op_resource_pass.cc b/src/imperative/attach_op_resource_pass.cc index 17d6d7a41dc3..5b8e3ebf4649 100644 --- a/src/imperative/attach_op_resource_pass.cc +++ b/src/imperative/attach_op_resource_pass.cc @@ -83,6 +83,12 @@ void AttachOpResources(const Graph& g, break; } #endif // MXNET_USE_CUDNN == 1 +#if MXNET_USE_CUDA == 1 + case ResourceRequest::kMultiGPUComm: { + requested.push_back(ResourceManager::Get()->Request(ctx, req)); + break; + } +#endif // MXNET_USE_NCCL == 1 default: LOG(FATAL) << "resource type " << req.type << " is not yet supported"; } diff --git a/src/operator/contrib/nn/reduce_op.cc b/src/operator/contrib/nn/reduce_op.cc index b21c290a505e..35f840affed8 100644 --- a/src/operator/contrib/nn/reduce_op.cc +++ b/src/operator/contrib/nn/reduce_op.cc @@ -62,7 +62,7 @@ NNVM_REGISTER_OP(_contrib_NCCLReduce) }) .set_attr("FInferShape", NCCLReduceShape) .set_attr("FInferType", NCCLReduceType) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_NCCLReduce"}) +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_NCCLReduce"}) .set_attr("FInplaceOption", [](const NodeAttrs& attrs) { return std::vector >{{0, 0}}; diff --git a/src/operator/contrib/nn/reduce_op.cu 
b/src/operator/contrib/nn/reduce_op.cu index 6c0a1a92f2fa..2b7d9ff0c935 100644 --- a/src/operator/contrib/nn/reduce_op.cu +++ b/src/operator/contrib/nn/reduce_op.cu @@ -36,10 +36,9 @@ void NCCLReduceCompute(const nnvm::NodeAttrs& attrs, const index_t size = inputs[0].shape_.Size() * common::mshadow_type_info(inputs[0].type_flag_).size; if (req[0] != OpReqType::kAddTo) { - ncclResult_t result = ncclReduce(inputs[0].dptr_, outputs[0].dptr_, - size, ncclInt8, ncclSum, param.root_rank, + size, ncclFloat32, ncclAvg, param.root_rank, comm, mshadow::Stream::GetStream(ctx.get_stream())); From 33fc23d0fe47721d29fc630720b4a185194e58a4 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 21 Dec 2021 12:36:59 +0000 Subject: [PATCH 08/22] add reduce into graph --- example/extensions/lib_pass/pass_lib.cc | 92 ++++++++++++++++++++++++- 1 file changed, 91 insertions(+), 1 deletion(-) diff --git a/example/extensions/lib_pass/pass_lib.cc b/example/extensions/lib_pass/pass_lib.cc index 31b362addb8d..7170199ba380 100644 --- a/example/extensions/lib_pass/pass_lib.cc +++ b/example/extensions/lib_pass/pass_lib.cc @@ -31,11 +31,29 @@ using namespace mxnet::ext; /* \brief a basic pass that prints out the options and the graph */ +/* MXReturnValue myPass(mxnet::ext::Graph* g, const std::unordered_map& options) { + int cur_rank = -1; + std::string nccl_unique_id = "asd"; for (auto kv : options) { - std::cout << "option: " << kv.first << " ==> " << kv.second << std::endl; + std::cout << "option: " << kv.first << " ==> " << kv.second << std::endl;} + + std::cout<<" print options" << std::endl; + size_t length = g->size(); + mxnet::ext::Node *tmp; + for (int i = 0;i < length; i += 1) + { + tmp = g->getNode(i); + std::cout<name<attrs; + for (auto it = attrs.begin(); it!=attrs.end(); it ++ ) + { + std::cout<name<<":attrs:"<first<<" : "<second<addNode("myConv","_contrib_NCCLReduce"); + /* //g->addNode("myConv","Convolution"); mxnet::ext::Node *copy_old_layer = g->getNode(g->size()-2); mxnet::ext::Node *old_layer = g->getNode(g->size()-1); @@ -92,6 +110,78 @@ MXReturnValue myPass(mxnet::ext::Graph* g, return MX_SUCCESS; } +*/ + + +MXReturnValue myPass(mxnet::ext::Graph* g, + const std::unordered_map& options) { + std::string cur_rank = ""; + std::string nccl_unique_id = ""; + std::string num_gpus = ""; + for (auto kv : options) { + std::cout << "option: " << kv.first << " ==> " << kv.second << std::endl; + if (kv.first == "rank") + { + cur_rank = kv.second.c_str(); + } + if (kv.first == "nccl_unique_id") + nccl_unique_id = kv.second.c_str(); + if (kv.first == "num_gpus") + num_gpus = kv.second.c_str(); + } + std::cout<<"nccl:"<size(); + mxnet::ext::Node *tmp; + std::string root_rank; + mxnet::ext::Node *target_node; + int index = 0; + for (int i = 0;i < length; i += 1) + { + target_node = g->getNode(i); + std::cout<<"deal with:" << target_node->name<name); + if (it == options.end()) {continue;} // req_grad == null + root_rank = it->second; + mxnet::ext::Node *new_reduce = g->addNode("ncclreduce_" + std::to_string(index),"_contrib_NCCLReduce"); + index += 1; + auto new_attrs = &new_reduce->attrs; + auto old_attrs = target_node->attrs; + for (auto it = old_attrs.begin(); it!=old_attrs.end(); it++) + { + if (it->first == "__ext_dtype__" || it->first == "__ext_shape__" || it->first == "__profiler_scope__") + { + new_attrs ->insert({{it->first, it->second}}); + } + } + new_attrs->insert({{"nccl_unique_id", nccl_unique_id}}); + new_attrs->insert({{"num_gpus", num_gpus}}); + new_attrs->insert({{"rank", cur_rank}}); + 
new_attrs->insert({{"root_rank", root_rank}}); + + for (int i=0;ioutputs.size(); i++) + { + new_reduce->outputs.push_back(target_node->outputs[i]); + mxnet::ext::Node *output_node = target_node->outputs[i].node; + int index = target_node->outputs[i].entry; + //std::cout<<"try change:"<name<<":"<inputs.size()<inputs[index].node = new_reduce; + } + for (int i=0;ioutputs.size(); i++) + { + target_node->outputs.pop_back(); + } + target_node->outputs.push_back({new_reduce, 0}); + new_reduce->inputs.push_back({target_node, 0}); + + } + g->print(); + + + return MX_SUCCESS; +} + + REGISTER_PASS(myPass).setBody(myPass); From 996a4e766009446e6eaedee9441b19e6d7ef6854 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 21 Dec 2021 13:03:26 +0000 Subject: [PATCH 09/22] reverse operation --- src/operator/contrib/nn/reduce_op-inl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/operator/contrib/nn/reduce_op-inl.h b/src/operator/contrib/nn/reduce_op-inl.h index 771a4fa6ecce..8b08d7d6994f 100644 --- a/src/operator/contrib/nn/reduce_op-inl.h +++ b/src/operator/contrib/nn/reduce_op-inl.h @@ -38,7 +38,7 @@ struct NCCLReduceParam : public dmlc::Parameter { }; template -struct ncclreduce_backward { +struct ncclreduce_compute { template MSHADOW_XINLINE static void Map(int i, DType* in_grad, @@ -48,7 +48,7 @@ struct ncclreduce_backward { }; template -void NCCLReduceBackward(const nnvm::NodeAttrs& attrs, +void NCCLReduceCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, const std::vector& req, @@ -63,7 +63,7 @@ void NCCLReduceBackward(const nnvm::NodeAttrs& attrs, using namespace mxnet_op; MSHADOW_TYPE_SWITCH(out_grad.type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { - Kernel, xpu>::Launch(s, + Kernel, xpu>::Launch(s, in_grad.Size(), in_grad.dptr(), out_grad.dptr()); From 547763357be414541251300389e3efa3c706bda9 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 21 Dec 2021 13:03:36 +0000 Subject: [PATCH 10/22] reverse operation --- src/operator/contrib/nn/reduce_op.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/operator/contrib/nn/reduce_op.cc b/src/operator/contrib/nn/reduce_op.cc index 35f840affed8..e00245b06ed1 100644 --- a/src/operator/contrib/nn/reduce_op.cc +++ b/src/operator/contrib/nn/reduce_op.cc @@ -56,6 +56,7 @@ NNVM_REGISTER_OP(_contrib_NCCLReduce) .set_attr_parser(ParamParser) .set_num_inputs(1) .set_num_outputs(1) +.set_attr("FCompute", NCCLReduceCompute) .set_attr("FListInputNames", [](const NodeAttrs& attrs) { return std::vector{"data"}; @@ -83,7 +84,6 @@ NNVM_REGISTER_OP(_backward_NCCLReduce) .set_attr_parser(ParamParser) .set_num_inputs(1) .set_num_outputs(1) -.set_attr("TIsBackward", true) -.set_attr("FCompute", NCCLReduceBackward); +.set_attr("TIsBackward", true); } } \ No newline at end of file From 0fc7f6a1c38959f8ca9d6a3e2737c4d52a0a66c1 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 21 Dec 2021 13:03:40 +0000 Subject: [PATCH 11/22] reverse operation --- src/operator/contrib/nn/reduce_op.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/operator/contrib/nn/reduce_op.cu b/src/operator/contrib/nn/reduce_op.cu index 2b7d9ff0c935..655023c6c964 100644 --- a/src/operator/contrib/nn/reduce_op.cu +++ b/src/operator/contrib/nn/reduce_op.cu @@ -18,7 +18,7 @@ namespace mxnet { namespace op { -void NCCLReduceCompute(const nnvm::NodeAttrs& attrs, +void NCCLReduceBackward(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, const std::vector& 
req, @@ -50,10 +50,10 @@ void NCCLReduceCompute(const nnvm::NodeAttrs& attrs, } NNVM_REGISTER_OP(_contrib_NCCLReduce) -.set_attr("FCompute", NCCLReduceCompute); +.set_attr("FCompute", NCCLReduceCompute); NNVM_REGISTER_OP(_backward_NCCLReduce) -.set_attr("FCompute", NCCLReduceBackward); +.set_attr("FCompute", NCCLReduceBackward); } // namespace op } // namespace mxnet \ No newline at end of file From f790512d6d47f7a39f3500ec72bd400b91011131 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 17 Jan 2022 11:33:04 +0000 Subject: [PATCH 12/22] print version --- cpp-package/include/mxnet-cpp/executor.h | 9 +- example/extensions/lib_pass/pass_lib.cc | 2 +- example/extensions/lib_pass/test_pass.py | 218 ++++++++++++++++++++--- include/mxnet/c_api.h | 5 +- include/mxnet/imperative.h | 3 +- python/mxnet/ndarray/ndarray.py | 17 +- src/c_api/c_api_ndarray.cc | 20 ++- src/imperative/cached_op.cc | 64 +++++-- src/imperative/cached_op.h | 12 +- src/imperative/imperative.cc | 20 ++- src/imperative/imperative_utils.cc | 20 ++- src/imperative/imperative_utils.h | 3 +- src/imperative/naive_cached_op.h | 3 +- src/operator/subgraph_op_common.cc | 2 +- 14 files changed, 333 insertions(+), 65 deletions(-) diff --git a/cpp-package/include/mxnet-cpp/executor.h b/cpp-package/include/mxnet-cpp/executor.h index fff559b79df3..bb602fc282c9 100644 --- a/cpp-package/include/mxnet-cpp/executor.h +++ b/cpp-package/include/mxnet-cpp/executor.h @@ -132,7 +132,9 @@ class Executor { 1, nullptr, nullptr), - 0); + 0, + nullptr, + nullptr); } else { CHECK_EQ(MXAutogradBackwardEx(out_handles.size(), out_handles.data(), @@ -144,7 +146,10 @@ class Executor { 1, nullptr, nullptr), - 0); + 0, + 0, + nullptr, + nullptr); } grad_arrays.clear(); grad_arrays.reserve(arg_arrays.size()); diff --git a/example/extensions/lib_pass/pass_lib.cc b/example/extensions/lib_pass/pass_lib.cc index 7170199ba380..df7ac8fa079f 100644 --- a/example/extensions/lib_pass/pass_lib.cc +++ b/example/extensions/lib_pass/pass_lib.cc @@ -143,7 +143,7 @@ MXReturnValue myPass(mxnet::ext::Graph* g, auto it = options.find(target_node->name); if (it == options.end()) {continue;} // req_grad == null root_rank = it->second; - mxnet::ext::Node *new_reduce = g->addNode("ncclreduce_" + std::to_string(index),"_contrib_NCCLReduce"); + mxnet::ext::Node *new_reduce = g->addNode("ncclreduce_" + target_node->name,"_contrib_NCCLReduce"); index += 1; auto new_attrs = &new_reduce->attrs; auto old_attrs = target_node->attrs; diff --git a/example/extensions/lib_pass/test_pass.py b/example/extensions/lib_pass/test_pass.py index 7e65eb60c75c..df115eee7853 100644 --- a/example/extensions/lib_pass/test_pass.py +++ b/example/extensions/lib_pass/test_pass.py @@ -27,8 +27,15 @@ import mxnet as mx from mxnet.gluon import nn from mxnet import nd +import numpy as np +from mxnet.lr_scheduler import PolyScheduler from mxnet import np, npx -from gluonnlp.layers import get_activation +from pos_trainer import POS_Trainer +try: + import horovod.mxnet as hvd +except ImportError: + pass +#from gluonnlp.layers import get_activation from mxnet.base import _LIB, check_call, mx_uint, c_str, c_str_array, SymbolHandle # load library @@ -39,6 +46,40 @@ path = os.path.abspath('libpass_lib.dll') mx.library.load(path) + +class _NCCLReduceHelper(object): + _init = False + nccl_id = None + num_gpus = None + rank = None + + @staticmethod + def init(num_gpus, root_rank): + """Communicate the NCCL unique id""" + cls = _NCCLReduceHelper + if not cls._init: + cls._init = True + import ctypes + try: + from mpi4py import 
MPI + except: + raise ImportError("Spatial parallel modules require mpi4py package.") + import numpy as np + nccl_id_size = ctypes.c_int() + check_call(_LIB.MXNCCLGetUniqueIdSize(ctypes.byref(nccl_id_size))) + nccl_id_size = nccl_id_size.value + cls.nccl_id = np.zeros(nccl_id_size, np.byte) + check_call(_LIB.MXNCCLGetUniqueId( + cls.nccl_id.ctypes.data_as(ctypes.c_void_p))) + global_comm = MPI.COMM_WORLD + rank = global_comm.rank + color = rank / num_gpus + comm = global_comm.Split(color, rank) + comm.Bcast([cls.nccl_id, nccl_id_size, MPI.BYTE], root=0) + cls.num_gpus = num_gpus + cls.rank = rank % num_gpus + cls.root_rank = root_rank % num_gpus + assert num_gpus == cls.num_gpus ############################################### # Test with not consuming params ############################################### @@ -55,7 +96,7 @@ def __init__(self): super().__init__() self.l1 = nn.Dense(in_units=2, units=2, flatten=False) #self.l2 = nn.Dense(in_units=2, units=2, flatten=False) - self.act1 = get_activation('relu') + #self.act1 = get_activation('relu') #self.seq.add(nn.Dense(in_units=2, units=2, flatten=False)) #self.seq.add(get_activation('relu')) @@ -66,42 +107,167 @@ def __init__(self): def forward(self, input): input = self.l1(input) - input = self.act1(input) + #print(input) + #input = self.act1(input) return input def test_model(pass_name): + from mxnet import gluon + from mxnet.gluon import Block, nn, HybridBlock + from mxnet import init + + + + hvd.init() + rank = hvd.rank() + size = hvd.size() + ctx = mx.gpu(rank) + + np.random.seed(1234 + 10 * rank) + mx.random.seed(1234 + 10 * rank) + + num_gpus = size + root_rank = 0 + helper = _NCCLReduceHelper + helper.init(num_gpus, 0) + kwargs = { + 'num_gpus': num_gpus, + 'rank': helper.rank, + 'root_rank': helper.root_rank, + 'nccl_unique_id': helper.nccl_id.ctypes.data} + + options = {"num_gpus":num_gpus, "rank": rank, "nccl_unique_id": helper.nccl_id.ctypes.data} + model = Easynet() - model.initialize() + + if rank == 0: + model.l1.weight.initialize(init=init.One(), ctx=ctx) + model.l1.bias.initialize(init=init.One(), ctx=ctx) + else: + model.l1.weight.initialize(init = init.Zero(), ctx = ctx) + model.l1.bias.initialize(init=init.Zero(), ctx=ctx) model.hybridize() + #param_dict = classify_net.collect_params() + #params = [p for p in param_dict.values() if p.grad_req != 'null'] + params = model.collect_params() + index = 0 + dic = {} + lr_scheduler = PolyScheduler(max_update=1, + base_lr=1e-3, + warmup_begin_lr=0.0, + pwr=1, + final_lr=0.0, + warmup_steps=0, + warmup_mode='linear') + optimizer_params = {'learning_rate': 1e-3, + 'wd': 1e-2, + 'lr_scheduler': lr_scheduler} + trainer = POS_Trainer(params, "adam", optimizer_params) + cor_rank = trainer.correspond_ranks() + for name in params: + type = name.split('.')[-1] + index = cor_rank[params[name]._uuid] + new_name = params[name]._uuid.replace('-', '_') + '_' + type + dic[new_name] = index - print('try on model') - x = np.array([[1,2]]) + options.update(dic) + print(options) + #return + backward_options = {"partition_grad":True, "current_rank":rank} + for k in options: + if k in ['num_gpus', 'rank', 'nccl_unique_id']: + continue + backward_options['ncclreduce_' + k + '_backward'] = options[k] + print(backward_options) + x = np.ones((1,2), ctx = ctx) + label = np.ones((2, ), ctx = ctx) * rank + #print(options) - model.optimize_for(x, backend = pass_name) + loss_function = gluon.loss.L2Loss() - out = model(x) - model.export("my_model") - print(out.shape) - return - args={'a':mx.nd.ones((3,2)), 
'b':mx.nd.ones((3,2))} - # execute in MXNet - print('-------------------------------') - print('Testing regular MXNet execution') - inputs = [a,b] + model.optimize_for(x, backend = pass_name, **options) + #model.export("my_reduce_" + str(rank)) + for i in range(1): + with mx.autograd.record(): + out = model(x) + loss = loss_function(out, label).mean() / size + print("now call backward in python") + #print(loss.backward) + #print(loss) + #print(loss.backward) + loss.backward(backward_option = backward_options) + mx.npx.waitall() + mx.nd.waitall() + for name in params: + print(name, params[name].list_grad()[0]) + + + #print(out) + + +def test_reduce(pass_name): + from mxnet import gluon + from mxnet.gluon import Block, nn, HybridBlock + from mxnet import init + + hvd.init() + rank = hvd.rank() + size = hvd.size() + ctx = mx.gpu(rank) + + num_gpus = size + root_rank = 0 + helper = _NCCLReduceHelper + helper.init(num_gpus, root_rank) + kwargs = { + 'num_gpus': num_gpus, + 'rank': helper.rank, + 'root_rank': helper.root_rank, + 'nccl_unique_id': helper.nccl_id.ctypes.data} + + options = {"rank": rank, "nccl_unique_id":helper.nccl_id.ctypes.data} + + a = mx.sym.var('a') + sym = mx.sym.contrib.NCCLReduce(a, **kwargs) + + inputs = [a] sym_block = nn.SymbolBlock(sym, inputs) - sym_block.initialize() - out = sym_block(mx.nd.ones((3,2)),mx.nd.ones((3,2))) + sym_block.initialize(ctx = ctx) + p = mx.nd.ones((2, 3), ctx=ctx) * (rank + 2) + sym_block.optimize_for(p, backend = pass_name, **options) + sym_block.export("my_choice") + return + + print(p) + out = sym_block(p) + mx.nd.waitall() + mx.npx.waitall() print(out) + return + class ReduceNet(nn.HybridBlock): + def __init__(self): + super().__init__() + self.reduce_layer = NCCLReduce(num_gpus=2, root_rank=0) + def forward(self, x): + x = self.reduce_layer(x) + return x + + net = ReduceNet() + net.initialize(ctx=ctx) + net.hybridize() + data = mx.nd.ones((3,2), ctx = ctx) + net(data) + #data = mx.np.ones((1, 10), ctx=ctx) * rank + a = mx.sym.var('a') + out = net(a) + inputs = [a] + out_sym = nn.SymbolBlock(out, inputs) + out_sym.initialize(ctx = ctx) + true_out = out_sym(mx.nd.ones((3,2))) + + print(true_out) - # Gluon optimize_for - print('-------------------------------') - print('Testing pass "%s" Gluon Hybridize with shapes/types without inference' % pass_name) - inputs = [a,b] - sym_block2 = nn.SymbolBlock(sym, inputs) - sym_block2.initialize() - sym_block2.optimize_for(mx.nd.ones((3,2)), mx.nd.ones((3,2)), backend=pass_name) - sym_block2.export('modified') test_model('myPass') diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 4120cecf36ab..7b38d83c9b31 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -1342,7 +1342,10 @@ MXNET_DLL int MXAutogradBackwardEx(uint32_t num_output, int create_graph, int is_train, NDArrayHandle** grad_handles, - int** grad_stypes); + int** grad_stypes, + const mx_uint num_options, + const char** keys, + const char** vals); /* * \brief get the graph constructed by autograd. * \param handle ndarray handle diff --git a/include/mxnet/imperative.h b/include/mxnet/imperative.h index e4e3f6a938d0..f49637dbf263 100644 --- a/include/mxnet/imperative.h +++ b/include/mxnet/imperative.h @@ -279,7 +279,8 @@ class Imperative { const std::vector& variables, bool is_train, bool retain_graph, - bool create_graph); + bool create_graph, + const std::unordered_map backward_options_map = {}); /*! \brief Return the marked nonleaf nodes. */ std::vector ListNonleafVariables(const nnvm::Symbol& sym) const; /*! 
\return AutogradRuntime singleton */ diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index fd76789918a1..7ac9f13113cb 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -34,7 +34,7 @@ from functools import reduce # pylint: disable=redefined-builtin import numpy as np from ..base import _LIB, numeric_types, integer_types -from ..base import c_array, c_array_buf, c_handle_array, mx_real_t +from ..base import c_array, c_array_buf, c_handle_array, mx_real_t, c_str_array from ..base import mx_uint, NDArrayHandle, check_call, mx_int, mx_int64 from ..base import ctypes2buffer from ..dlpack import ndarray_to_dlpack_for_read, ndarray_to_dlpack_for_write @@ -2924,7 +2924,7 @@ def detach(self): check_call(_LIB.MXNDArrayDetach(self.handle, ctypes.byref(hdl))) return _ndarray_cls(hdl) - def backward(self, out_grad=None, retain_graph=False, train_mode=True): + def backward(self, out_grad=None, retain_graph=False, train_mode=True, backward_option = {}): """Compute the gradients of this NDArray w.r.t variables. Parameters @@ -2942,7 +2942,13 @@ def backward(self, out_grad=None, retain_graph=False, train_mode=True): ograd_handles = [NDArrayHandle(0)] else: ograd_handles = [out_grad.handle] - + print("in ndarray.py in line 2945") + key_list = [] + val_list = [] + for key, val in backward_option.items(): + key_list.append(key) + val_list.append(str(val)) + print("in ndarray.py in line 2951", key_list, val_list) check_call(_LIB.MXAutogradBackwardEx( 1, c_handle_array([self]), c_array(NDArrayHandle, ograd_handles), @@ -2952,7 +2958,10 @@ def backward(self, out_grad=None, retain_graph=False, train_mode=True): ctypes.c_int(0), ctypes.c_int(train_mode), ctypes.c_void_p(0), - ctypes.c_void_p(0))) + ctypes.c_void_p(0), + mx_uint(len(key_list)), + c_str_array(key_list), + c_str_array(val_list))) def tostype(self, stype): """Return a copy of the array with chosen storage type. 
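The hunks above thread a string key/value option list from Python's NDArray.backward through MXAutogradBackwardEx and into Imperative::Backward. Below is a minimal C-API sketch (not part of this patch) of how a caller might exercise the extended entry point once the series is applied; the handle setup is assumed, and the option names "partition_grad" and "current_rank" are the ones the series later looks up in cached_op.cc.

#include <string>
#include <mxnet/c_api.h>

// Sketch only: run a backward pass through the extended MXAutogradBackwardEx,
// forwarding backward options as key/value strings. `loss` is assumed to be a
// valid NDArrayHandle whose gradient buffers are already attached.
int RunBackwardWithOptions(NDArrayHandle loss, int current_rank) {
  NDArrayHandle outputs[] = {loss};
  NDArrayHandle ograds[]  = {nullptr};            // null => default head gradient
  const std::string rank  = std::to_string(current_rank);
  // cached_op.cc only checks that "partition_grad" is present; the value is unused.
  const char* keys[] = {"partition_grad", "current_rank"};
  const char* vals[] = {"True", rank.c_str()};
  return MXAutogradBackwardEx(1, outputs, ograds,
                              0, nullptr,         // no explicit variables
                              0 /*retain_graph*/, 0 /*create_graph*/, 1 /*is_train*/,
                              nullptr, nullptr,   // gradient handles/stypes not requested
                              2, keys, vals);
}

This mirrors what the patched ndarray.py wrapper does: it stringifies the backward_option dict into parallel key and value arrays before calling into the C API.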
diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc index 2e9c0a373621..b436f7991ac8 100644 --- a/src/c_api/c_api_ndarray.cc +++ b/src/c_api/c_api_ndarray.cc @@ -362,6 +362,9 @@ int MXAutogradBackward(uint32_t num_output, false, true, nullptr, + nullptr, + 0, + nullptr, nullptr); } @@ -374,10 +377,15 @@ int MXAutogradBackwardEx(uint32_t num_output, int create_graph, int is_train, NDArrayHandle** grad_handles, - int** grad_stypes) { + int** grad_stypes, + const mx_uint num_options, + const char** keys, + const char** vals) { MXAPIThreadLocalEntry<>* ret = MXAPIThreadLocalStore<>::Get(); API_BEGIN(); - + std::unordered_map backward_options_map; + for (mx_uint i = 0; i < num_options; ++i) + backward_options_map.emplace(keys[i], vals[i]); std::vector outputs, ograds, variables; outputs.reserve(num_output); for (uint32_t i = 0; i < num_output; ++i) { @@ -397,9 +405,13 @@ int MXAutogradBackwardEx(uint32_t num_output, for (uint32_t i = 0; i < num_variables; ++i) { variables.emplace_back(reinterpret_cast(var_handles[i])); } - + std::cout<<"in c_api_ndarray.cc in line 400" << std::endl; + for (auto it = backward_options_map.begin(); it!=backward_options_map.end(); it ++ ) + { + std::cout<< "in line 408 in c_api_ndarray.cc:" << it->first << " : " << it->second<Backward(outputs, ograds, variables, is_train, retain_graph, create_graph); + Imperative::Get()->Backward(outputs, ograds, variables, is_train, retain_graph, create_graph, backward_options_map); if (num_variables != 0) { ret->ret_handles.clear(); ret->out_types.clear(); diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc index 894ef09a1d16..4ad23b72a9c0 100644 --- a/src/imperative/cached_op.cc +++ b/src/imperative/cached_op.cc @@ -259,7 +259,8 @@ void SetBackwardInputEid(const std::vector& bwd_in_dep, bool CachedOp::SetBackwardGraph(GraphInfo* info, const std::vector& reqs, const std::vector& inputs, - bool detect_inplace_addto) { + bool detect_inplace_addto, + const std::unordered_map backward_options_map) { using namespace nnvm; using namespace imperative; std::lock_guard lock(mutex_); @@ -279,7 +280,27 @@ bool CachedOp::SetBackwardGraph(GraphInfo* info, g.attrs["context"] = std::make_shared( std::vector(g.indexed_graph().num_nodes(), default_ctx)); } + //deal with delete + auto pg_option = backward_options_map.find("partition_grad"); + if (pg_option != backward_options_map.end()) + { + auto it = backward_options_map.find("current_rank"); + int cur_rank = std::atoi(it->second.c_str()); + auto help_it = g.outputs.begin(); + auto output_it = g.outputs.begin(); + while (output_it!=g.outputs.end()) + { + it = backward_options_map.find(output_it->node ->attrs.name); + if (it!=backward_options_map.end() && atoi(it->second.c_str()) != cur_rank) + { + g.outputs.erase(output_it); + } + else{ + output_it++; + } + } + } const auto& idx = g.indexed_graph(); if (info->bwd_input_eid.size() != inputs.size()) { @@ -378,7 +399,10 @@ bool CachedOp::SetBackwardGraph(GraphInfo* info, {num_forward_entries, idx.num_node_entries()}, detect_inplace_addto); g.attrs[AddPrefix(BACKWARD, MEM_PLAN)] = std::make_shared(std::move(mem_plan)); - + for (auto it = g.outputs.begin(); it != g.outputs.end(); it++ ) + { + std::cout<<"in line 383 graph node " << it->node ->attrs.name << std::endl; + } return false; } @@ -902,7 +926,8 @@ void CachedOp::DynamicBackward(const bool retain_graph, const OpStatePtr& op_state, const std::vector& inputs, const std::vector& reqs, - const std::vector& outputs) { + const std::vector& outputs, + const 
std::unordered_map backward_options_map) { using namespace nnvm; using namespace imperative; @@ -915,11 +940,20 @@ void CachedOp::DynamicBackward(const bool retain_graph, std::lock_guard lock(state.mutex); state.info.fwd_graph = runtime.info.fwd_graph; state.info.input_map = runtime.info.input_map; - SetBackwardGraph(&state.info, reqs, inputs); + std::cout<<"in line 918 in cached_op.cc" << std::endl; + SetBackwardGraph(&state.info, reqs, inputs, false, backward_options_map); runtime.info.full_graph = state.info.full_graph; runtime.info.bwd_input_eid = state.info.bwd_input_eid; } nnvm::Graph& g = runtime.info.full_graph; + for (auto it = backward_options_map.begin(); it!=backward_options_map.end(); it ++) + { + std::cout<<"in line 930 backward:" << it->first<<":"<second << std::endl; + } + for (auto it = g.outputs.begin(); it != g.outputs.end(); it++ ) + { + std::cout<<"in line 926 graph node " << it->node ->attrs.name << std::endl; + } const auto& idx = g.indexed_graph(); auto& buff = runtime.buff; auto& states = runtime.op_states; @@ -1005,7 +1039,8 @@ void CachedOp::StaticBackward(const bool retain_graph, const OpStatePtr& state_ptr, const std::vector& inputs, const std::vector& reqs, - const std::vector& outputs) { + const std::vector& outputs, + const std::unordered_map backward_options_map) { using namespace nnvm; using namespace imperative; @@ -1014,9 +1049,15 @@ void CachedOp::StaticBackward(const bool retain_graph, auto& state = state_ptr.get_state(); std::lock_guard lock(state.mutex); - bool match = SetBackwardGraph(&state.info, reqs, inputs, true); + bool match = SetBackwardGraph(&state.info, reqs, inputs, true, backward_options_map); nnvm::Graph& g = state.info.full_graph; + //deal with options + // + for (auto it = backward_options_map.begin(); it!=backward_options_map.end(); it ++ ) + { + std::cout<< "in line 1034 in cachedop:" << it->first << " : " << it->second<& inputs, const std::vector& reqs, - const std::vector& outputs) { + const std::vector& outputs, + const std::unordered_map backward_options_map) { const auto& fwd_idx = fwd_graph_.indexed_graph(); const auto& full_idx = full_graph_.indexed_graph(); const auto& mutable_input_nodes = fwd_idx.mutable_input_nodes(); @@ -1121,9 +1163,11 @@ void CachedOp::Backward(const bool retain_graph, try { if (config_.static_alloc) { - StaticBackward(retain_graph, state, inputs, reqs, outputs); + std::cout<<"call static back"<set_bulk_size(prev_bulk_size); @@ -1261,7 +1305,7 @@ void CachedOpBackward(const OpStatePtr& state_ptr, // pass a flag to determine whether to record computation inside an operator. // Let's use false here for now and design a solution when the second-order // differentiation is supported. - s.op->Backward(false, s.forward_state, in_ptrs, req, out_ptrs); + s.op->Backward(false, s.forward_state, in_ptrs, req, out_ptrs, {}); Imperative::Get()->set_is_training(orig_is_train); // Clean up what we recorded. 
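Before the header changes below, a self-contained sketch of the partitioning rule that SetBackwardGraph applies above when "partition_grad" is set: a gradient output is kept unless the options map registers it to a different root rank. GradOutput is a stand-in for nnvm::NodeEntry used only for illustration, and unlike the hunk above the sketch reassigns the iterator returned by erase, the safe idiom for erasing while iterating.

#include <cstdlib>
#include <string>
#include <unordered_map>
#include <vector>

// Stand-in for nnvm::NodeEntry: only the gradient node's name matters here.
struct GradOutput { std::string name; };

// Keep a gradient output unless the options map assigns it to another rank.
// Mirrors the "partition_grad" branch of SetBackwardGraph in this series;
// assumes "current_rank" accompanies "partition_grad", as in the test script.
void PartitionGradOutputs(std::vector<GradOutput>* outputs,
                          const std::unordered_map<std::string, std::string>& options) {
  if (options.find("partition_grad") == options.end()) return;
  const int cur_rank = std::atoi(options.at("current_rank").c_str());
  for (auto it = outputs->begin(); it != outputs->end();) {
    auto opt = options.find(it->name);
    if (opt != options.end() && std::atoi(opt->second.c_str()) != cur_rank) {
      it = outputs->erase(it);  // gradient owned by another rank: drop it locally
    } else {
      ++it;
    }
  }
}

With the options built by the test script (one entry per gradient name mapping to its root rank), each worker ends up computing only the gradients it owns.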
diff --git a/src/imperative/cached_op.h b/src/imperative/cached_op.h index 079a56e20a12..e8d441114a0c 100644 --- a/src/imperative/cached_op.h +++ b/src/imperative/cached_op.h @@ -501,7 +501,8 @@ class CachedOp { const OpStatePtr& state, const std::vector& inputs, const std::vector& reqs, - const std::vector& outputs); + const std::vector& outputs, + const std::unordered_map backward_options_map); // backward storage type inference virtual bool BackwardStorageType(const nnvm::NodeAttrs& attrs, const int dev_mask, @@ -606,7 +607,8 @@ class CachedOp { bool SetBackwardGraph(GraphInfo* info, const std::vector& reqs, const std::vector& inputs, - bool detect_inplace_addto = false); + bool detect_inplace_addto = false, + const std::unordered_map backward_options_map = {}); bool CheckDynamicShapeExists(const Context& default_ctx, const std::vector& inputs, bool erase_result); @@ -632,12 +634,14 @@ class CachedOp { const OpStatePtr& op_state, const std::vector& inputs, const std::vector& reqs, - const std::vector& outputs); + const std::vector& outputs, + const std::unordered_map backward_options_map); void StaticBackward(const bool retain_graph, const OpStatePtr& state_ptr, const std::vector& inputs, const std::vector& reqs, - const std::vector& outputs); + const std::vector& outputs, + const std::unordered_map backward_options_map); size_t BwdOriginalInput(const std::vector& input_map, size_t new_i); CachedOpConfig config_; diff --git a/src/imperative/imperative.cc b/src/imperative/imperative.cc index b9bdaac9476f..bef07a635d17 100644 --- a/src/imperative/imperative.cc +++ b/src/imperative/imperative.cc @@ -440,7 +440,8 @@ std::vector Imperative::Backward(const std::vector& outputs, const std::vector& variables, bool is_train, bool retain_graph, - bool create_graph) { + bool create_graph, + const std::unordered_map backward_options_map) { using namespace nnvm; using namespace imperative; static const std::vector zero_ops{Op::Get("zeros_like"), Op::Get("_zeros")}; @@ -519,7 +520,7 @@ std::vector Imperative::Backward(const std::vector& outputs, for (const auto& i : nleaf_vars) { us.emplace_back(NodeEntry{i, 0, 0}); } - + std::cout<<"in imperative.cc in line 522"< Imperative::Backward(const std::vector& outputs, graph.outputs.push_back(e); } } + for (auto it = graph.outputs.begin(); it != graph.outputs.end(); it++ ) + { + std::cout<<"in line 547 in imperative.cc graph node " << it->node ->attrs.name << std::endl; + } const auto& idx = graph.indexed_graph(); // get number of nodes used in forward pass size_t num_forward_nodes = 0; @@ -712,7 +717,10 @@ std::vector Imperative::Backward(const std::vector& outputs, bool prev_recording = set_is_recording(create_graph); bool prev_training = set_is_training(is_train); int prev_bulk_size = Engine::Get()->set_bulk_size(backward_bulk_size_); - + for (auto it = backward_options_map.begin(); it!=backward_options_map.end(); it ++ ) + { + std::cout<< "in line 722 in imperative.cc:" << it->first << " : " << it->second< Imperative::Backward(const std::vector& outputs, std::move(ref_count), &states, dispatch_modes, - is_recording()); + is_recording(), + nullptr, + nullptr, + false, + backward_options_map); } catch (const dmlc::Error& e) { Engine::Get()->set_bulk_size(prev_bulk_size); set_is_recording(prev_recording); diff --git a/src/imperative/imperative_utils.cc b/src/imperative/imperative_utils.cc index e3a58804d8ac..812318dee5f5 100644 --- a/src/imperative/imperative_utils.cc +++ b/src/imperative/imperative_utils.cc @@ -75,7 +75,8 @@ void InvokeOperator(const 
nnvm::IndexedGraph& idx, const std::vector& ndoutputs, std::vector* p_req, std::vector* p_ref_count, - std::function invoke) { + std::function invoke, + const std::unordered_map backward_options_map) { static const auto bwd_cached_op = Op::Get("_backward_CachedOp"); static auto& createop = nnvm::Op::GetAttr("FCreateOpState"); static auto& is_layer_backward = Op::GetAttr("TIsLayerOpBackward"); @@ -88,7 +89,11 @@ void InvokeOperator(const nnvm::IndexedGraph& idx, const auto& cached_op = dmlc::get(node.source->attrs.parsed); nnvm::Node* fwd_node = node.source->control_deps[0].get(); auto fwd_node_id = idx.node_id(fwd_node); - cached_op->Backward(retain_graph, states[fwd_node_id], ndinputs, req, ndoutputs); + for (auto it = backward_options_map.begin(); it!=backward_options_map.end(); it ++ ) + { + std::cout<< "in line 94 in imperative_utils.cc:" << it->first << " : " << it->second<Backward(retain_graph, states[fwd_node_id], ndinputs, req, ndoutputs, backward_options_map); } else if (createop.count(node.source->op())) { mxnet::ShapeVector arg_shapes; nnvm::DTypeVector arg_dtypes; @@ -138,7 +143,8 @@ void RunGraph(const bool retain_graph, bool recording, mxnet::ShapeVector* shapes, const imperative::CachedOpMonCallback& callback, - const bool monitor_all) { + const bool monitor_all, + const std::unordered_map backward_options_map) { CHECK(shapes == nullptr); for (size_t i = node_start; i < node_end; ++i) { const nnvm::IndexedGraph::Node& node = idx[i]; @@ -161,8 +167,12 @@ void RunGraph(const bool retain_graph, Imperative::Get()->RecordOp(NodeAttrs(node.source->attrs), ndinputs, ndoutputs, state); } }; + for (auto it = backward_options_map.begin(); it!=backward_options_map.end(); it ++ ) + { + std::cout<< "in line 172 in rungraph in imperative_utils.cc:" << it->first << " : " << it->second< backward_options_map = {}); void NaiveRunGraph(const bool retain_graph, const Context& default_ctx, diff --git a/src/imperative/naive_cached_op.h b/src/imperative/naive_cached_op.h index cd1365508f97..037d4d489448 100644 --- a/src/imperative/naive_cached_op.h +++ b/src/imperative/naive_cached_op.h @@ -48,7 +48,8 @@ class NaiveCachedOp : public CachedOp { const OpStatePtr& state, const std::vector& inputs, const std::vector& reqs, - const std::vector& outputs) override { + const std::vector& outputs, + const std::unordered_map backward_options_map) override { LOG(FATAL) << "Backward is not supported in NaiveCachedOp."; } // backward storage type inference diff --git a/src/operator/subgraph_op_common.cc b/src/operator/subgraph_op_common.cc index ad12d3ded5d2..a5647bcad859 100644 --- a/src/operator/subgraph_op_common.cc +++ b/src/operator/subgraph_op_common.cc @@ -274,7 +274,7 @@ void LoopState::Backward(int iter_no, outputs.push_back(&igrad_bufs[i]); CHECK_EQ(outputs.size(), op->num_inputs()); auto state = all_states[iter_no]; - op->Backward(false, state, inputs, req, outputs); + op->Backward(false, state, inputs, req, outputs, {}); // If an input and an output share the array, the output array will be changed // by CachedOp. We need to copy data to the real output. 
for (size_t i = 0; i < igrads.size(); i++) From a31848aaa54106092b7536e94540d8506c688b69 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 17 Jan 2022 12:45:34 +0000 Subject: [PATCH 13/22] add backward option --- example/extensions/lib_pass/test_pass.py | 25 ++++++++++++++++++------ python/mxnet/ndarray/ndarray.py | 2 -- src/c_api/c_api_ndarray.cc | 5 ----- src/imperative/cached_op.cc | 24 ++--------------------- src/imperative/imperative.cc | 10 +--------- src/imperative/imperative_utils.cc | 9 +-------- 6 files changed, 23 insertions(+), 52 deletions(-) diff --git a/example/extensions/lib_pass/test_pass.py b/example/extensions/lib_pass/test_pass.py index df115eee7853..e4087a185528 100644 --- a/example/extensions/lib_pass/test_pass.py +++ b/example/extensions/lib_pass/test_pass.py @@ -95,7 +95,7 @@ class Easynet(nn.HybridBlock): def __init__(self): super().__init__() self.l1 = nn.Dense(in_units=2, units=2, flatten=False) - #self.l2 = nn.Dense(in_units=2, units=2, flatten=False) + self.l2 = nn.Dense(in_units=2, units=2, flatten=False) #self.act1 = get_activation('relu') #self.seq.add(nn.Dense(in_units=2, units=2, flatten=False)) @@ -105,11 +105,12 @@ def __init__(self): #self.l1.register_op_hook(mon_callback, monitor_all=True) - def forward(self, input): - input = self.l1(input) + def forward(self, input1, input2): + output_1 = self.l1(input1) + output_2 = self.l2(input2) #print(input) #input = self.act1(input) - return input + return output_1 + output_2 def test_model(pass_name): @@ -144,9 +145,13 @@ def test_model(pass_name): if rank == 0: model.l1.weight.initialize(init=init.One(), ctx=ctx) model.l1.bias.initialize(init=init.One(), ctx=ctx) + model.l2.weight.initialize(init=init.One(), ctx=ctx) + model.l2.bias.initialize(init=init.One(), ctx=ctx) else: model.l1.weight.initialize(init = init.Zero(), ctx = ctx) model.l1.bias.initialize(init=init.Zero(), ctx=ctx) + model.l2.weight.initialize(init=init.Zero(), ctx=ctx) + model.l2.bias.initialize(init=init.Zero(), ctx=ctx) model.hybridize() #param_dict = classify_net.collect_params() #params = [p for p in param_dict.values() if p.grad_req != 'null'] @@ -174,24 +179,32 @@ def test_model(pass_name): options.update(dic) print(options) + #return + ''' backward_options = {"partition_grad":True, "current_rank":rank} for k in options: if k in ['num_gpus', 'rank', 'nccl_unique_id']: continue backward_options['ncclreduce_' + k + '_backward'] = options[k] print(backward_options) + ''' + options = trainer.generate_graph_pass_options() + print(options) + backward_options = trainer.generate_backward_options() + print(backward_options) x = np.ones((1,2), ctx = ctx) + x2 = np.ones((1, 2), ctx=ctx) label = np.ones((2, ), ctx = ctx) * rank #print(options) loss_function = gluon.loss.L2Loss() - model.optimize_for(x, backend = pass_name, **options) + model.optimize_for(x, x2, backend = "myPass", **options) #model.export("my_reduce_" + str(rank)) for i in range(1): with mx.autograd.record(): - out = model(x) + out = model(x, x2) loss = loss_function(out, label).mean() / size print("now call backward in python") #print(loss.backward) diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index 7ac9f13113cb..e56ff4e34c9e 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -2942,13 +2942,11 @@ def backward(self, out_grad=None, retain_graph=False, train_mode=True, backward_ ograd_handles = [NDArrayHandle(0)] else: ograd_handles = [out_grad.handle] - print("in ndarray.py in line 2945") key_list = [] val_list = 
[] for key, val in backward_option.items(): key_list.append(key) val_list.append(str(val)) - print("in ndarray.py in line 2951", key_list, val_list) check_call(_LIB.MXAutogradBackwardEx( 1, c_handle_array([self]), c_array(NDArrayHandle, ograd_handles), diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc index b436f7991ac8..0b30373f68b2 100644 --- a/src/c_api/c_api_ndarray.cc +++ b/src/c_api/c_api_ndarray.cc @@ -405,11 +405,6 @@ int MXAutogradBackwardEx(uint32_t num_output, for (uint32_t i = 0; i < num_variables; ++i) { variables.emplace_back(reinterpret_cast(var_handles[i])); } - std::cout<<"in c_api_ndarray.cc in line 400" << std::endl; - for (auto it = backward_options_map.begin(); it!=backward_options_map.end(); it ++ ) - { - std::cout<< "in line 408 in c_api_ndarray.cc:" << it->first << " : " << it->second<Backward(outputs, ograds, variables, is_train, retain_graph, create_graph, backward_options_map); if (num_variables != 0) { diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc index 4ad23b72a9c0..c29a7ab96c80 100644 --- a/src/imperative/cached_op.cc +++ b/src/imperative/cached_op.cc @@ -286,7 +286,6 @@ bool CachedOp::SetBackwardGraph(GraphInfo* info, { auto it = backward_options_map.find("current_rank"); int cur_rank = std::atoi(it->second.c_str()); - auto help_it = g.outputs.begin(); auto output_it = g.outputs.begin(); while (output_it!=g.outputs.end()) { @@ -399,10 +398,7 @@ bool CachedOp::SetBackwardGraph(GraphInfo* info, {num_forward_entries, idx.num_node_entries()}, detect_inplace_addto); g.attrs[AddPrefix(BACKWARD, MEM_PLAN)] = std::make_shared(std::move(mem_plan)); - for (auto it = g.outputs.begin(); it != g.outputs.end(); it++ ) - { - std::cout<<"in line 383 graph node " << it->node ->attrs.name << std::endl; - } + return false; } @@ -940,20 +936,11 @@ void CachedOp::DynamicBackward(const bool retain_graph, std::lock_guard lock(state.mutex); state.info.fwd_graph = runtime.info.fwd_graph; state.info.input_map = runtime.info.input_map; - std::cout<<"in line 918 in cached_op.cc" << std::endl; SetBackwardGraph(&state.info, reqs, inputs, false, backward_options_map); runtime.info.full_graph = state.info.full_graph; runtime.info.bwd_input_eid = state.info.bwd_input_eid; } nnvm::Graph& g = runtime.info.full_graph; - for (auto it = backward_options_map.begin(); it!=backward_options_map.end(); it ++) - { - std::cout<<"in line 930 backward:" << it->first<<":"<second << std::endl; - } - for (auto it = g.outputs.begin(); it != g.outputs.end(); it++ ) - { - std::cout<<"in line 926 graph node " << it->node ->attrs.name << std::endl; - } const auto& idx = g.indexed_graph(); auto& buff = runtime.buff; auto& states = runtime.op_states; @@ -1052,12 +1039,7 @@ void CachedOp::StaticBackward(const bool retain_graph, bool match = SetBackwardGraph(&state.info, reqs, inputs, true, backward_options_map); nnvm::Graph& g = state.info.full_graph; - //deal with options - // - for (auto it = backward_options_map.begin(); it!=backward_options_map.end(); it ++ ) - { - std::cout<< "in line 1034 in cachedop:" << it->first << " : " << it->second< Imperative::Backward(const std::vector& outputs, for (const auto& i : nleaf_vars) { us.emplace_back(NodeEntry{i, 0, 0}); } - std::cout<<"in imperative.cc in line 522"< Imperative::Backward(const std::vector& outputs, graph.outputs.push_back(e); } } - for (auto it = graph.outputs.begin(); it != graph.outputs.end(); it++ ) - { - std::cout<<"in line 547 in imperative.cc graph node " << it->node ->attrs.name << std::endl; - } const 
auto& idx = graph.indexed_graph(); // get number of nodes used in forward pass size_t num_forward_nodes = 0; @@ -717,10 +712,7 @@ std::vector Imperative::Backward(const std::vector& outputs, bool prev_recording = set_is_recording(create_graph); bool prev_training = set_is_training(is_train); int prev_bulk_size = Engine::Get()->set_bulk_size(backward_bulk_size_); - for (auto it = backward_options_map.begin(); it!=backward_options_map.end(); it ++ ) - { - std::cout<< "in line 722 in imperative.cc:" << it->first << " : " << it->second<(node.source->attrs.parsed); nnvm::Node* fwd_node = node.source->control_deps[0].get(); auto fwd_node_id = idx.node_id(fwd_node); - for (auto it = backward_options_map.begin(); it!=backward_options_map.end(); it ++ ) - { - std::cout<< "in line 94 in imperative_utils.cc:" << it->first << " : " << it->second<Backward(retain_graph, states[fwd_node_id], ndinputs, req, ndoutputs, backward_options_map); } else if (createop.count(node.source->op())) { mxnet::ShapeVector arg_shapes; @@ -167,10 +163,7 @@ void RunGraph(const bool retain_graph, Imperative::Get()->RecordOp(NodeAttrs(node.source->attrs), ndinputs, ndoutputs, state); } }; - for (auto it = backward_options_map.begin(); it!=backward_options_map.end(); it ++ ) - { - std::cout<< "in line 172 in rungraph in imperative_utils.cc:" << it->first << " : " << it->second< Date: Fri, 21 Jan 2022 08:51:39 +0000 Subject: [PATCH 14/22] return to intial version --- example/extensions/lib_pass/pass_lib.cc | 146 +----------------------- 1 file changed, 1 insertion(+), 145 deletions(-) diff --git a/example/extensions/lib_pass/pass_lib.cc b/example/extensions/lib_pass/pass_lib.cc index df7ac8fa079f..d219299ac9ad 100644 --- a/example/extensions/lib_pass/pass_lib.cc +++ b/example/extensions/lib_pass/pass_lib.cc @@ -25,164 +25,20 @@ #include #include #include -#include #include "mxnet/lib_api.h" using namespace mxnet::ext; /* \brief a basic pass that prints out the options and the graph */ -/* MXReturnValue myPass(mxnet::ext::Graph* g, const std::unordered_map& options) { - int cur_rank = -1; - std::string nccl_unique_id = "asd"; - for (auto kv : options) { - std::cout << "option: " << kv.first << " ==> " << kv.second << std::endl;} - - std::cout<<" print options" << std::endl; - size_t length = g->size(); - mxnet::ext::Node *tmp; - for (int i = 0;i < length; i += 1) - { - tmp = g->getNode(i); - std::cout<name<attrs; - for (auto it = attrs.begin(); it!=attrs.end(); it ++ ) - { - std::cout<name<<":attrs:"<first<<" : "<second<addNode("myConv","_contrib_NCCLReduce"); - /* - //g->addNode("myConv","Convolution"); - mxnet::ext::Node *copy_old_layer = g->getNode(g->size()-2); - mxnet::ext::Node *old_layer = g->getNode(g->size()-1); - std::cout<name<addNode("myweight3","null"); - new_weight->alloc_arg({3,2}, MXContext::CPU(0),kFloat32); - - mxnet::ext::Node *new_bias = g->addNode("mybias3","null"); - new_bias->alloc_arg({3,}, MXContext::CPU(0),kFloat32); - - - - - //new_bias->alloc_aux({2}, MXContext::CPU(0), kInt32); - - - auto attrs = copy_old_layer->attrs; - mxnet::ext::Node *new_layer = g->addNode("mylinaer","FullyConnected"); - auto new_attrs = &new_layer->attrs; - for (auto it = attrs.begin(); it!=attrs.end(); it ++ ) - { - - std::cout<first<<" : "<second<first!="num_hidden"){ - new_attrs->insert({{it->first,it->second}});} - } - new_attrs->insert({{"num_hidden","3"}}); - - std::cout<<"current:"<begin(); it!=new_attrs->end(); it ++ ) - { - std::cout<first<<" : "<second<inputs.push_back({old_layer, 0}); - 
new_layer->inputs.push_back({new_weight, 0}); - new_layer->inputs.push_back({new_bias, 0}); - old_layer->outputs.push_back({new_layer, 0}); - new_weight->outputs.push_back({new_layer, 1}); - new_bias->outputs.push_back({new_layer, 2}); - auto it = g->outputs.begin(); - g->outputs.erase(it); - mxnet::ext::NodeEntry new_entry; - new_entry.node = new_layer; - new_entry.entry = 0; - g->outputs.push_back(new_entry); - - g->print(); - - - return MX_SUCCESS; -} -*/ - - -MXReturnValue myPass(mxnet::ext::Graph* g, - const std::unordered_map& options) { - std::string cur_rank = ""; - std::string nccl_unique_id = ""; - std::string num_gpus = ""; for (auto kv : options) { std::cout << "option: " << kv.first << " ==> " << kv.second << std::endl; - if (kv.first == "rank") - { - cur_rank = kv.second.c_str(); - } - if (kv.first == "nccl_unique_id") - nccl_unique_id = kv.second.c_str(); - if (kv.first == "num_gpus") - num_gpus = kv.second.c_str(); - } - std::cout<<"nccl:"<size(); - mxnet::ext::Node *tmp; - std::string root_rank; - mxnet::ext::Node *target_node; - int index = 0; - for (int i = 0;i < length; i += 1) - { - target_node = g->getNode(i); - std::cout<<"deal with:" << target_node->name<name); - if (it == options.end()) {continue;} // req_grad == null - root_rank = it->second; - mxnet::ext::Node *new_reduce = g->addNode("ncclreduce_" + target_node->name,"_contrib_NCCLReduce"); - index += 1; - auto new_attrs = &new_reduce->attrs; - auto old_attrs = target_node->attrs; - for (auto it = old_attrs.begin(); it!=old_attrs.end(); it++) - { - if (it->first == "__ext_dtype__" || it->first == "__ext_shape__" || it->first == "__profiler_scope__") - { - new_attrs ->insert({{it->first, it->second}}); - } - } - new_attrs->insert({{"nccl_unique_id", nccl_unique_id}}); - new_attrs->insert({{"num_gpus", num_gpus}}); - new_attrs->insert({{"rank", cur_rank}}); - new_attrs->insert({{"root_rank", root_rank}}); - - for (int i=0;ioutputs.size(); i++) - { - new_reduce->outputs.push_back(target_node->outputs[i]); - mxnet::ext::Node *output_node = target_node->outputs[i].node; - int index = target_node->outputs[i].entry; - //std::cout<<"try change:"<name<<":"<inputs.size()<inputs[index].node = new_reduce; - } - for (int i=0;ioutputs.size(); i++) - { - target_node->outputs.pop_back(); - } - target_node->outputs.push_back({new_reduce, 0}); - new_reduce->inputs.push_back({target_node, 0}); - } g->print(); - - return MX_SUCCESS; } - - REGISTER_PASS(myPass).setBody(myPass); MXReturnValue initialize(int version) { @@ -193,4 +49,4 @@ MXReturnValue initialize(int version) { MX_ERROR_MSG << "MXNet version " << version << " not supported" << std::endl; return MX_FAIL; } -} +} \ No newline at end of file From dacd88fa6f3118e42cb2e8430daff62496bcef05 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 21 Jan 2022 08:51:52 +0000 Subject: [PATCH 15/22] return to intial version --- example/extensions/lib_pass/test_pass.py | 248 ++--------------------- 1 file changed, 16 insertions(+), 232 deletions(-) diff --git a/example/extensions/lib_pass/test_pass.py b/example/extensions/lib_pass/test_pass.py index e4087a185528..3fa1fd22b891 100644 --- a/example/extensions/lib_pass/test_pass.py +++ b/example/extensions/lib_pass/test_pass.py @@ -27,15 +27,6 @@ import mxnet as mx from mxnet.gluon import nn from mxnet import nd -import numpy as np -from mxnet.lr_scheduler import PolyScheduler -from mxnet import np, npx -from pos_trainer import POS_Trainer -try: - import horovod.mxnet as hvd -except ImportError: - pass -#from gluonnlp.layers import 
get_activation from mxnet.base import _LIB, check_call, mx_uint, c_str, c_str_array, SymbolHandle # load library @@ -46,40 +37,6 @@ path = os.path.abspath('libpass_lib.dll') mx.library.load(path) - -class _NCCLReduceHelper(object): - _init = False - nccl_id = None - num_gpus = None - rank = None - - @staticmethod - def init(num_gpus, root_rank): - """Communicate the NCCL unique id""" - cls = _NCCLReduceHelper - if not cls._init: - cls._init = True - import ctypes - try: - from mpi4py import MPI - except: - raise ImportError("Spatial parallel modules require mpi4py package.") - import numpy as np - nccl_id_size = ctypes.c_int() - check_call(_LIB.MXNCCLGetUniqueIdSize(ctypes.byref(nccl_id_size))) - nccl_id_size = nccl_id_size.value - cls.nccl_id = np.zeros(nccl_id_size, np.byte) - check_call(_LIB.MXNCCLGetUniqueId( - cls.nccl_id.ctypes.data_as(ctypes.c_void_p))) - global_comm = MPI.COMM_WORLD - rank = global_comm.rank - color = rank / num_gpus - comm = global_comm.Split(color, rank) - comm.Bcast([cls.nccl_id, nccl_id_size, MPI.BYTE], root=0) - cls.num_gpus = num_gpus - cls.rank = rank % num_gpus - cls.root_rank = root_rank % num_gpus - assert num_gpus == cls.num_gpus ############################################### # Test with not consuming params ############################################### @@ -90,197 +47,24 @@ def init(num_gpus, root_rank): d = mx.sym.exp(c) sym = mx.sym.log(d) - -class Easynet(nn.HybridBlock): - def __init__(self): - super().__init__() - self.l1 = nn.Dense(in_units=2, units=2, flatten=False) - self.l2 = nn.Dense(in_units=2, units=2, flatten=False) - #self.act1 = get_activation('relu') - - #self.seq.add(nn.Dense(in_units=2, units=2, flatten=False)) - #self.seq.add(get_activation('relu')) - #self.seq.add(nn.Dense(in_units=2, units=2, flatten=False)) - #self.seq.register_op_hook(mon_callback, monitor_all=True) - #self.l1.register_op_hook(mon_callback, monitor_all=True) - - - def forward(self, input1, input2): - output_1 = self.l1(input1) - output_2 = self.l2(input2) - #print(input) - #input = self.act1(input) - return output_1 + output_2 - - def test_model(pass_name): - from mxnet import gluon - from mxnet.gluon import Block, nn, HybridBlock - from mxnet import init - - - - hvd.init() - rank = hvd.rank() - size = hvd.size() - ctx = mx.gpu(rank) - - np.random.seed(1234 + 10 * rank) - mx.random.seed(1234 + 10 * rank) - - num_gpus = size - root_rank = 0 - helper = _NCCLReduceHelper - helper.init(num_gpus, 0) - kwargs = { - 'num_gpus': num_gpus, - 'rank': helper.rank, - 'root_rank': helper.root_rank, - 'nccl_unique_id': helper.nccl_id.ctypes.data} - - options = {"num_gpus":num_gpus, "rank": rank, "nccl_unique_id": helper.nccl_id.ctypes.data} - - model = Easynet() - - if rank == 0: - model.l1.weight.initialize(init=init.One(), ctx=ctx) - model.l1.bias.initialize(init=init.One(), ctx=ctx) - model.l2.weight.initialize(init=init.One(), ctx=ctx) - model.l2.bias.initialize(init=init.One(), ctx=ctx) - else: - model.l1.weight.initialize(init = init.Zero(), ctx = ctx) - model.l1.bias.initialize(init=init.Zero(), ctx=ctx) - model.l2.weight.initialize(init=init.Zero(), ctx=ctx) - model.l2.bias.initialize(init=init.Zero(), ctx=ctx) - model.hybridize() - #param_dict = classify_net.collect_params() - #params = [p for p in param_dict.values() if p.grad_req != 'null'] - params = model.collect_params() - index = 0 - dic = {} - lr_scheduler = PolyScheduler(max_update=1, - base_lr=1e-3, - warmup_begin_lr=0.0, - pwr=1, - final_lr=0.0, - warmup_steps=0, - warmup_mode='linear') - optimizer_params = 
{'learning_rate': 1e-3, - 'wd': 1e-2, - 'lr_scheduler': lr_scheduler} - trainer = POS_Trainer(params, "adam", optimizer_params) - cor_rank = trainer.correspond_ranks() - - for name in params: - type = name.split('.')[-1] - index = cor_rank[params[name]._uuid] - new_name = params[name]._uuid.replace('-', '_') + '_' + type - dic[new_name] = index - - options.update(dic) - print(options) - - #return - ''' - backward_options = {"partition_grad":True, "current_rank":rank} - for k in options: - if k in ['num_gpus', 'rank', 'nccl_unique_id']: - continue - backward_options['ncclreduce_' + k + '_backward'] = options[k] - print(backward_options) - ''' - options = trainer.generate_graph_pass_options() - print(options) - backward_options = trainer.generate_backward_options() - print(backward_options) - x = np.ones((1,2), ctx = ctx) - x2 = np.ones((1, 2), ctx=ctx) - label = np.ones((2, ), ctx = ctx) * rank - #print(options) - - loss_function = gluon.loss.L2Loss() - - model.optimize_for(x, x2, backend = "myPass", **options) - #model.export("my_reduce_" + str(rank)) - for i in range(1): - with mx.autograd.record(): - out = model(x, x2) - loss = loss_function(out, label).mean() / size - print("now call backward in python") - #print(loss.backward) - #print(loss) - #print(loss.backward) - loss.backward(backward_option = backward_options) - mx.npx.waitall() - mx.nd.waitall() - for name in params: - print(name, params[name].list_grad()[0]) - - - #print(out) - - -def test_reduce(pass_name): - from mxnet import gluon - from mxnet.gluon import Block, nn, HybridBlock - from mxnet import init - - hvd.init() - rank = hvd.rank() - size = hvd.size() - ctx = mx.gpu(rank) - - num_gpus = size - root_rank = 0 - helper = _NCCLReduceHelper - helper.init(num_gpus, root_rank) - kwargs = { - 'num_gpus': num_gpus, - 'rank': helper.rank, - 'root_rank': helper.root_rank, - 'nccl_unique_id': helper.nccl_id.ctypes.data} - - options = {"rank": rank, "nccl_unique_id":helper.nccl_id.ctypes.data} - - a = mx.sym.var('a') - sym = mx.sym.contrib.NCCLReduce(a, **kwargs) - - inputs = [a] + args={'a':mx.nd.ones((3,2)), 'b':mx.nd.ones((3,2))} + # execute in MXNet + print('-------------------------------') + print('Testing regular MXNet execution') + inputs = [a,b] sym_block = nn.SymbolBlock(sym, inputs) - sym_block.initialize(ctx = ctx) - p = mx.nd.ones((2, 3), ctx=ctx) * (rank + 2) - sym_block.optimize_for(p, backend = pass_name, **options) - sym_block.export("my_choice") - return - - print(p) - out = sym_block(p) - mx.nd.waitall() - mx.npx.waitall() + sym_block.initialize() + out = sym_block(mx.nd.ones((3,2)),mx.nd.ones((3,2))) print(out) - return - class ReduceNet(nn.HybridBlock): - def __init__(self): - super().__init__() - self.reduce_layer = NCCLReduce(num_gpus=2, root_rank=0) - def forward(self, x): - x = self.reduce_layer(x) - return x - - net = ReduceNet() - net.initialize(ctx=ctx) - net.hybridize() - data = mx.nd.ones((3,2), ctx = ctx) - net(data) - #data = mx.np.ones((1, 10), ctx=ctx) * rank - a = mx.sym.var('a') - out = net(a) - inputs = [a] - out_sym = nn.SymbolBlock(out, inputs) - out_sym.initialize(ctx = ctx) - true_out = out_sym(mx.nd.ones((3,2))) - - print(true_out) + # Gluon optimize_for + print('-------------------------------') + print('Testing pass "%s" Gluon Hybridize with shapes/types without inference' % pass_name) + inputs = [a,b] + sym_block2 = nn.SymbolBlock(sym, inputs) + sym_block2.initialize() + sym_block2.optimize_for(mx.nd.ones((3,2)), mx.nd.ones((3,2)), backend=pass_name) + sym_block2.export('modified') 
-test_model('myPass')
+test_model('myPass')
\ No newline at end of file

From 4936c46597c42d30313e9b0888af1ffa04bdbd41 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Fri, 21 Jan 2022 09:21:57 +0000
Subject: [PATCH 16/22] add new dir

---
 .../extensions/lib_reduce_gradient/Makefile   |  24 +++
 .../extensions/lib_reduce_gradient/README.md  |  66 ++++++
 .../lib_reduce_gradient/add_reduce_op.cc      | 115 ++++++++++
 .../lib_reduce_gradient/pos_trainer.py        | 201 ++++++++++++++++++
 .../lib_reduce_gradient/test_reduce.py        | 129 +++++++++++
 5 files changed, 535 insertions(+)
 create mode 100644 example/extensions/lib_reduce_gradient/Makefile
 create mode 100644 example/extensions/lib_reduce_gradient/README.md
 create mode 100644 example/extensions/lib_reduce_gradient/add_reduce_op.cc
 create mode 100644 example/extensions/lib_reduce_gradient/pos_trainer.py
 create mode 100644 example/extensions/lib_reduce_gradient/test_reduce.py

diff --git a/example/extensions/lib_reduce_gradient/Makefile b/example/extensions/lib_reduce_gradient/Makefile
new file mode 100644
index 000000000000..95b2fdc4d9bc
--- /dev/null
+++ b/example/extensions/lib_reduce_gradient/Makefile
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+all: pass_lib
+
+pass_lib:
+	g++ -shared -fPIC -std=c++11 add_reduce_op.cc ../../../src/lib_api.cc -o add_reduce_op_lib.so -I ../../../include
+
+clean:
+	rm -rf libpass_lib.so
diff --git a/example/extensions/lib_reduce_gradient/README.md b/example/extensions/lib_reduce_gradient/README.md
new file mode 100644
index 000000000000..85cfb3d7b16f
--- /dev/null
+++ b/example/extensions/lib_reduce_gradient/README.md
@@ -0,0 +1,66 @@
+
+
+Add Reduce operation to computation Graph
+=======================================
+
+## Introduction
+This is part of the work of transferring [DeepSpeed's work](https://arxiv.org/abs/1910.02054) into MXNet.
+Because of the difference between the symbolic and imperative modes, we divide the whole process into two phases:
+
+Phase 1: add a reduce operation into the graph. The reduce operation does nothing
+in forward but reduces the gradient to the right GPU (according to the POS trainer).
+
+Phase 2: in the backward graph, delete the outputs from the arrays so the memory planner can reuse that memory.
+
+ ## Getting started
+ ### Prepare NCCL and Horovod
+ Since we use Horovod to communicate, please install Horovod first. We also use NCCL reduce, so please install NCCL as well.
+
+ ### Compile the Graph Pass and load it
+ Please compile it first, as for [lib pass](../lib_pass/). Run `make` and it will generate the dynamic library
+ **add_reduce_op_lib.so**, which is compiled from the `add_reduce_op.cc` file.
Then load such file in your python code like +```python +import mxnet as mx +mx.library.load('add_reduce_op_lib.so') +``` + + ### Prepare options + Then we need know the correct partition of parameters and gradients about their GPUs. + So please use **POS_Trainer** from `pos_trainer.py` like normal trainer in MXNet. + ```python +from pos_trainer import POS_Trainer +trainer = POS_Trainer(params_dict, "adam", optimizer_params) +``` +Then trainer can generate corresponding options like: + ```python +options = trainer.generate_graph_pass_options() +backward_options = trainer.generate_backward_options()] +``` +### modify graph +Before forward, we use + ```python +model.optimize_for(x, backend = "add_reduce_op", **options) +``` +to insert reduce operation into graphs. +Then we call backward option as + ```python +loss.backward(backward_option = backward_options) +``` \ No newline at end of file diff --git a/example/extensions/lib_reduce_gradient/add_reduce_op.cc b/example/extensions/lib_reduce_gradient/add_reduce_op.cc new file mode 100644 index 000000000000..8faad796029a --- /dev/null +++ b/example/extensions/lib_reduce_gradient/add_reduce_op.cc @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file subgraph_lib.cc + * \brief subgraph operator implementation library file + */ + +#include +#include +#include +#include +#include "mxnet/lib_api.h" + +using namespace mxnet::ext; + + + +MXReturnValue add_reduce_op(mxnet::ext::Graph* g, + const std::unordered_map& options) { + std::string cur_rank = ""; + + std::string num_gpus = ""; + std::string nccl_unique_id = ""; + + for (auto kv : options) { + std::cout << "option: " << kv.first << " ==> " << kv.second << std::endl; + if (kv.first == "rank") + { + cur_rank = kv.second.c_str(); + } + if (kv.first == "nccl_unique_id") + nccl_unique_id = kv.second.c_str(); + if (kv.first == "num_gpus") + num_gpus = kv.second.c_str(); + } + size_t length = g->size(); + mxnet::ext::Node *tmp; + std::string root_rank; + mxnet::ext::Node *target_node; + int index = 0; + for (int i = 0;i < length; i += 1) + { + target_node = g->getNode(i); + //std::cout<<"deal with:" << target_node->name<name); + if (it == options.end()) {continue;} // req_grad == null + root_rank = it->second; + mxnet::ext::Node *new_reduce = g->addNode("ncclreduce_" + target_node->name,"_contrib_NCCLReduce"); + index += 1; + auto new_attrs = &new_reduce->attrs; + auto old_attrs = target_node->attrs; + for (auto it = old_attrs.begin(); it!=old_attrs.end(); it++) + { + if (it->first == "__ext_dtype__" || it->first == "__ext_shape__" || it->first == "__profiler_scope__") + { + new_attrs ->insert({{it->first, it->second}}); + } + } + new_attrs->insert({{"nccl_unique_id", nccl_unique_id}}); + new_attrs->insert({{"num_gpus", num_gpus}}); + new_attrs->insert({{"rank", cur_rank}}); + new_attrs->insert({{"root_rank", root_rank}}); + + for (int i=0;ioutputs.size(); i++) + { + new_reduce->outputs.push_back(target_node->outputs[i]); + mxnet::ext::Node *output_node = target_node->outputs[i].node; + int index = target_node->outputs[i].entry; + //std::cout<<"try change:"<name<<":"<inputs.size()<inputs[index].node = new_reduce; + } + for (int i=0;ioutputs.size(); i++) + { + target_node->outputs.pop_back(); + } + target_node->outputs.push_back({new_reduce, 0}); + new_reduce->inputs.push_back({target_node, 0}); + + } + g->print(); + + + return MX_SUCCESS; +} + + + +REGISTER_PASS(add_reduce_op).setBody(add_reduce_op); + +MXReturnValue initialize(int version) { + if (version >= 10700) { + std::cout << "MXNet version " << version << " supported" << std::endl; + return MX_SUCCESS; + } else { + MX_ERROR_MSG << "MXNet version " << version << " not supported" << std::endl; + return MX_FAIL; + } +} diff --git a/example/extensions/lib_reduce_gradient/pos_trainer.py b/example/extensions/lib_reduce_gradient/pos_trainer.py new file mode 100644 index 000000000000..357289a8ab07 --- /dev/null +++ b/example/extensions/lib_reduce_gradient/pos_trainer.py @@ -0,0 +1,201 @@ +# POS_Trainer is the stage one : partition optmizer status in DeepSpeed's work +# It can reduce memory consumption in distributed Trainer but slower +# since we can not solve overlapping problem when calling broadcast and optimize parameters. +# The usage of this trainer is totally same with original one +# I test some benchmark Here: +# For 4 V100 Gpu with 16GB memory, the maximum batch size for bert-large and bert-base: +# bert-large: Original: 16 Pos: 24 +# bert-base: Original: 64 Pos: 80 +# The ideal average saving memory for each GPU is: (N-1)/N * P * K +# where N is the GPU number, P is the parameter number and K is the memory +# multiplier of optimizer states(E.g. 
for Adam, K = 12) +#TODO add group_num +from horovod.mxnet.mpi_ops import allreduce, allreduce_ +from horovod.mxnet.mpi_ops import broadcast, broadcast_ +from horovod.mxnet.mpi_ops import init, shutdown +from horovod.mxnet.mpi_ops import size, local_size, rank, local_rank +from mxnet.base import _LIB, check_call, mx_uint, c_str, c_str_array, SymbolHandle + +import mxnet as mx +from collections import OrderedDict, defaultdict +import types +import time +import warnings +from mxnet.gluon.parameter import Parameter +from horovod.mxnet.mpi_ops import ProcessSet, global_process_set, add_process_set, remove_process_set + +class _NCCLReduceHelper(object): + _init = False + nccl_id = None + num_gpus = None + rank = None + + @staticmethod + def init(num_gpus, root_rank): + """Communicate the NCCL unique id""" + cls = _NCCLReduceHelper + if not cls._init: + cls._init = True + import ctypes + try: + from mpi4py import MPI + except: + raise ImportError("Spatial parallel modules require mpi4py package.") + import numpy as np + nccl_id_size = ctypes.c_int() + check_call(_LIB.MXNCCLGetUniqueIdSize(ctypes.byref(nccl_id_size))) + nccl_id_size = nccl_id_size.value + cls.nccl_id = np.zeros(nccl_id_size, np.byte) + check_call(_LIB.MXNCCLGetUniqueId( + cls.nccl_id.ctypes.data_as(ctypes.c_void_p))) + global_comm = MPI.COMM_WORLD + rank = global_comm.rank + color = rank / num_gpus + comm = global_comm.Split(color, rank) + comm.Bcast([cls.nccl_id, nccl_id_size, MPI.BYTE], root=0) + cls.num_gpus = num_gpus + cls.rank = rank % num_gpus + cls.root_rank = root_rank % num_gpus + assert num_gpus == cls.num_gpus + + +class POS_Trainer(mx.gluon.Trainer): + def __init__(self, params, optimizer, optimizer_params=None, + gradient_predivide_factor=1.0, prefix=None, partition_gradients = False): + + self._world_size = size() + self._world_rank = rank() + + self._partition_gradients = partition_gradients + + self._all_params = [] + self._all_param2idx = {} + self._all_params_with_names = params + param_list = [] + if isinstance(params, (dict, OrderedDict)): + for key in sorted(list(params.keys())): + param_list.append(params[key]) + params = param_list + if not isinstance(params, (list, tuple)): + raise ValueError( + "First argument must be a list or dict of Parameters, " \ + "got %s." % (type(params))) + for i, param in enumerate(params): + if not isinstance(param, Parameter): + raise ValueError( + "First argument must be a list or dict of Parameters, " \ + "got list of %s." % (type(param))) + if param._uuid in self._all_param2idx: + # Shared parameters have same uuid; only need to store one of the shared versions + continue + self._all_param2idx[param._uuid] = i + self._all_params.append(param) + self._partition_params, self._param2rank = self._partition_parameters(self._all_params) + self._own_part = self._partition_params[self._world_rank] + super(POS_Trainer, self).__init__( + self._own_part, optimizer, optimizer_params=optimizer_params, kvstore=None) + self._prefix = prefix if prefix else "" + self._scale = gradient_predivide_factor / size() + self._gradient_predivide_factor = gradient_predivide_factor + + + + def _partition_parameters(self, params): + """ + partition all the parameters by their size and try to average them. 
+ """ + world_size = self._world_size + ## list for rank each would be + partition_params = [[] for _ in range(world_size)] + param2rank = {} + sizes = [0 for _ in range(world_size)] + for param in params: + if param.grad_req != 'null': + current_rank = sizes.index(min(sizes)) + partition_params[current_rank].append(param) + num = 1 + param2rank[param._uuid] = current_rank + for p in param.shape: + num *= p + sizes[current_rank] += num + return partition_params, param2rank + + def _allreduce_grads(self): + """ + rewrite allreduce here because we need to communicate using horovod. + Actually we should use reduce here, but since it is not available yet, + I use allreduce instead. + """ + if not self._partition_gradients: + for i, param in enumerate(self._all_params): + if param.grad_req != 'null': + allreduce_(param.list_grad()[0], average=False, + name=self._prefix + str(i), priority=-i, + prescale_factor=1.0 / self._gradient_predivide_factor) + + + + + + def step(self, batch_size, ignore_stale_grad=False): + """ + inherit from trainer, only call boardcast to make sure all parameter are consistent + Makes one step of parameter update. + Since each process main their own part, we need to brodcast after calculation + """ + super(POS_Trainer, self).step(batch_size, ignore_stale_grad) + self._broadcast_partition_params() + + if not self._kv_initialized: + self._init_kvstore() + if self._params_to_init: + self._init_params() + + def update(self, batch_size, ignore_stale_grad=False): + ''' + assert not (self._kvstore and self._update_on_kvstore), \ + 'update() when parameters are updated on kvstore ' \ + 'is not supported. Try setting `update_on_kvstore` ' \ + 'to False when creating trainer.' + Since each process main their own part, we need to brodcast after calculation + ''' + + + super(POS_Trainer, self).update(batch_size, ignore_stale_grad) + self._broadcast_partition_params() + + def _broadcast_partition_params(self): + """ + This function is to broadcast parameter since each process will maintain their own part + """ + for param in self._all_params: + broadcast_(param.data(), self._param2rank[param._uuid], name=str(self._all_param2idx[param._uuid])) + + def correspond_ranks(self): + return self._param2rank + + def generate_graph_pass_options(self): + #helper = _NCCLReduceHelper + #helper.init(size(), 0) + #helper2 = _NCCLReduceHelper + #helper2.init(size(), 0) + options = {} + for name in self._all_params_with_names: + type = name.split('.')[-1] + index = self._param2rank[self._all_params_with_names[name]._uuid] + new_name = self._all_params_with_names[name]._uuid.replace('-', '_') + '_' + type + options[new_name] = index + + helper = _NCCLReduceHelper + helper.init(size(), 0) + options.update({"num_gpus": size(), "rank": rank(), "nccl_unique_id":helper.nccl_id.ctypes.data}) + return options + + def generate_backward_options(self): + backward_option = {"partition_grad":True, "current_rank":rank()} + for name in self._all_params_with_names: + type = name.split('.')[-1] + index = self._param2rank[self._all_params_with_names[name]._uuid] + new_name = 'ncclreduce_' + self._all_params_with_names[name]._uuid.replace('-', '_') + '_' + type + "_backward" + backward_option[new_name] = index + return backward_option diff --git a/example/extensions/lib_reduce_gradient/test_reduce.py b/example/extensions/lib_reduce_gradient/test_reduce.py new file mode 100644 index 000000000000..b8e40580f0f0 --- /dev/null +++ b/example/extensions/lib_reduce_gradient/test_reduce.py @@ -0,0 +1,129 @@ +#!/usr/bin/env 
python3 + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable=arguments-differ + +# This test checks if dynamic loading of library into MXNet is successful +# and checks the end of end computation of custom operator + +import os, ctypes +import mxnet as mx +import time +from mxnet.gluon import nn +from mxnet import nd +import numpy as np +from mxnet.lr_scheduler import PolyScheduler +from mxnet import np, npx +from pos_trainer import POS_Trainer +try: + import horovod.mxnet as hvd +except ImportError: + pass +from mxnet.base import _LIB, check_call, mx_uint, c_str, c_str_array, SymbolHandle + +# load library +if (os.name=='posix'): + path = os.path.abspath('add_reduce_op_lib.so') + mx.library.load(path) +elif (os.name=='nt'): + path = os.path.abspath('add_reduce_op_lib.dll') + mx.library.load(path) + + +class Easynet(nn.HybridBlock): + def __init__(self, n): + super().__init__() + self.ls = nn.HybridSequential() + for i in range(n): + self.ls.add(nn.Dense(in_units=2, units=2, flatten=False)) + + + + def forward(self, input): + input = self.ls(input) + return input + + +def test_model(): + from mxnet import gluon + from mxnet.gluon import Block, nn, HybridBlock + from mxnet import init + + + + hvd.init() + rank = hvd.rank() + size = hvd.size() + ctx = mx.gpu(rank) + + np.random.seed(1234 + 10 * rank) + mx.random.seed(1234 + 10 * rank) + + + number = 2 + model = Easynet(number) + + if rank == 0: + for i in range(number): + model.ls[i].weight.initialize(init=init.One(), ctx=ctx) + model.ls[i].bias.initialize(init=init.One(), ctx=ctx) + else: + for i in range(number): + model.ls[i].weight.initialize(init=init.Zero(), ctx=ctx) + model.ls[i].bias.initialize(init=init.Zero(), ctx=ctx) + + model.hybridize() + + params = model.collect_params() + lr_scheduler = PolyScheduler(max_update=1, + base_lr=1e-3, + warmup_begin_lr=0.0, + pwr=1, + final_lr=0.0, + warmup_steps=0, + warmup_mode='linear') + optimizer_params = {'learning_rate': 1e-3, + 'wd': 1e-2, + 'lr_scheduler': lr_scheduler} + trainer = POS_Trainer(params, "adam", optimizer_params) + + options = trainer.generate_graph_pass_options() + backward_options = trainer.generate_backward_options() + x = np.ones((1,2), ctx = ctx) + label = np.ones((2, ), ctx = ctx) * rank + #print(options) + + loss_function = gluon.loss.L2Loss() + + model.optimize_for(x, backend = "add_reduce_op", **options) + for i in range(1): + with mx.autograd.record(): + out = model(x) + loss = loss_function(out, label).mean() / size + loss.backward(backward_option = backward_options) + mx.npx.waitall() + mx.nd.waitall() + + for name in params: + print(name, params[name].list_grad()[0]) + print('finish') + + +test_model() From 5d9d2f327b31711d3164fb2bfac9ccbc050e87d8 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 
21 Jan 2022 09:23:02 +0000 Subject: [PATCH 17/22] add example --- example/extensions/lib_reduce_gradient/README.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/example/extensions/lib_reduce_gradient/README.md b/example/extensions/lib_reduce_gradient/README.md index 85cfb3d7b16f..43d524e01ff3 100644 --- a/example/extensions/lib_reduce_gradient/README.md +++ b/example/extensions/lib_reduce_gradient/README.md @@ -52,7 +52,7 @@ trainer = POS_Trainer(params_dict, "adam", optimizer_params) Then trainer can generate corresponding options like: ```python options = trainer.generate_graph_pass_options() -backward_options = trainer.generate_backward_options()] +backward_options = trainer.generate_backward_options() ``` ### modify graph Before forward, we use @@ -63,4 +63,12 @@ to insert reduce operation into graphs. Then we call backward option as ```python loss.backward(backward_option = backward_options) -``` \ No newline at end of file +``` +###Simple Example +Please see `test_reduce.py` + +###Current problem +1. The reduce operation will cause deadlock (it won't happen in NaiveEngine). Moreover, it will meet invalid address +problem in complex model like Bert-Base. +2. We do remove outputs from backward graph using backward option. But we need to verify whether it decrease the memory +consumption. \ No newline at end of file From 9eb616ced66fa959b74be65277f832ad65ce044b Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 21 Jan 2022 09:25:06 +0000 Subject: [PATCH 18/22] add annotations --- example/extensions/lib_reduce_gradient/pos_trainer.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/example/extensions/lib_reduce_gradient/pos_trainer.py b/example/extensions/lib_reduce_gradient/pos_trainer.py index 357289a8ab07..eb22e2948973 100644 --- a/example/extensions/lib_reduce_gradient/pos_trainer.py +++ b/example/extensions/lib_reduce_gradient/pos_trainer.py @@ -171,14 +171,8 @@ def _broadcast_partition_params(self): for param in self._all_params: broadcast_(param.data(), self._param2rank[param._uuid], name=str(self._all_param2idx[param._uuid])) - def correspond_ranks(self): - return self._param2rank - def generate_graph_pass_options(self): - #helper = _NCCLReduceHelper - #helper.init(size(), 0) - #helper2 = _NCCLReduceHelper - #helper2.init(size(), 0) + #Generate options for graph pass, key is parameter name and value is its rank options = {} for name in self._all_params_with_names: type = name.split('.')[-1] @@ -192,6 +186,7 @@ def generate_graph_pass_options(self): return options def generate_backward_options(self): + #generate backward option for deleting, key is the node name and value is its corresponding rank backward_option = {"partition_grad":True, "current_rank":rank()} for name in self._all_params_with_names: type = name.split('.')[-1] From cd349ce34498ec8037c3a467764db9c26a5aa5a1 Mon Sep 17 00:00:00 2001 From: xinyual <74362153+xinyual@users.noreply.github.com> Date: Fri, 21 Jan 2022 22:35:08 +0800 Subject: [PATCH 19/22] add picture --- .../lib_reduce_gradient/addreduce.png | Bin 0 -> 12062 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 example/extensions/lib_reduce_gradient/addreduce.png diff --git a/example/extensions/lib_reduce_gradient/addreduce.png b/example/extensions/lib_reduce_gradient/addreduce.png new file mode 100644 index 0000000000000000000000000000000000000000..40338b972d454e585b9b5589f8005bddcd8a0048 GIT binary patch literal 12062 
[... 12062 bytes of base85-encoded binary image data for addreduce.png omitted ...]
literal 0
HcmV?d00001

From fc119ed63cfd67b50c21a8033f0fdd1d7cbff1c8 Mon Sep 17 00:00:00 2001
From: xinyual <74362153+xinyual@users.noreply.github.com>
Date: Fri, 21 Jan 2022 22:36:02 +0800
Subject: [PATCH 20/22] add picture

---
 example/extensions/lib_reduce_gradient/README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/example/extensions/lib_reduce_gradient/README.md b/example/extensions/lib_reduce_gradient/README.md
index 43d524e01ff3..e1fc0efcd8b0 100644
--- a/example/extensions/lib_reduce_gradient/README.md
+++ b/example/extensions/lib_reduce_gradient/README.md
@@ -60,6 +60,7 @@ Before forward, we use
 model.optimize_for(x, backend = "add_reduce_op", **options)
 ```
 to insert reduce operation into graphs.
+![example add reduce](addreduce.png)
 Then we call backward option as
 ```python
 loss.backward(backward_option = backward_options)
@@ -71,4 +72,4 @@ Please see `test_reduce.py`
 1. The reduce operation will cause deadlock (it won't happen in NaiveEngine). Moreover, it will meet invalid address
 problem in complex model like Bert-Base.
 2. We do remove outputs from backward graph using backward option. But we need to verify whether it decrease the memory
-consumption.
\ No newline at end of file
+consumption.
From a5d4e3e1b0430c5313790dba2d4fedd3c8a7de05 Mon Sep 17 00:00:00 2001
From: xinyual <74362153+xinyual@users.noreply.github.com>
Date: Fri, 21 Jan 2022 22:36:26 +0800
Subject: [PATCH 21/22] Update README.md

---
 example/extensions/lib_reduce_gradient/README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/example/extensions/lib_reduce_gradient/README.md b/example/extensions/lib_reduce_gradient/README.md
index e1fc0efcd8b0..0619f66f8b19 100644
--- a/example/extensions/lib_reduce_gradient/README.md
+++ b/example/extensions/lib_reduce_gradient/README.md
@@ -60,7 +60,8 @@ Before forward, we use
 model.optimize_for(x, backend = "add_reduce_op", **options)
 ```
 to insert reduce operation into graphs.
-![example add reduce](addreduce.png)
+![example add reduce](addreduce.png)
+
 Then we call backward option as
 ```python
 loss.backward(backward_option = backward_options)

From 4e764ea7aa78a9e66173aa3e03f2c16a69127b22 Mon Sep 17 00:00:00 2001
From: xinyual <74362153+xinyual@users.noreply.github.com>
Date: Fri, 21 Jan 2022 22:36:49 +0800
Subject: [PATCH 22/22] Update README.md

---
 example/extensions/lib_reduce_gradient/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/example/extensions/lib_reduce_gradient/README.md b/example/extensions/lib_reduce_gradient/README.md
index 0619f66f8b19..bcc3617cb8e2 100644
--- a/example/extensions/lib_reduce_gradient/README.md
+++ b/example/extensions/lib_reduce_gradient/README.md
@@ -66,10 +66,10 @@ Then we call backward option as
 ```python
 loss.backward(backward_option = backward_options)
 ```
-###Simple Example
+### Simple Example
 Please see `test_reduce.py`
 
-###Current problem
+### Current problem
 1. The reduce operation will cause deadlock (it won't happen in NaiveEngine). Moreover, it will meet invalid address
 problem in complex model like Bert-Base.
 2. We do remove outputs from backward graph using backward option. But we need to verify whether it decrease the memory
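
As a closing back-of-the-envelope check on the memory claim in the header comment of `pos_trainer.py` (PATCH 16/22) — "the ideal average saving memory for each GPU is (N-1)/N * P * K" — the sketch below plugs in the comment's own Adam multiplier. The BERT-large parameter count is an assumption (roughly 340M parameters), and K is read as bytes of optimizer state per parameter:

```python
# Estimate of per-GPU optimizer-state memory saved by ZeRO stage-1 style partitioning.
N = 4        # number of GPUs (the 4x V100 setup mentioned in the pos_trainer.py comment)
P = 3.4e8    # parameter count, assumed ~340M for BERT-large
K = 12       # optimizer-state multiplier for Adam, per the comment (read as bytes per parameter)

saved = (N - 1) / N * P * K                         # ideal average saving per GPU, in bytes
print(f"~{saved / 2**30:.1f} GiB saved per GPU")    # prints ~2.8 GiB for these numbers
```

A saving of roughly 2.8 GiB per 16 GB V100 is in the right ballpark for the batch-size increases (16 to 24 for BERT-large) reported in that comment.
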