From 5190ac0e1d820f47db2ae6ee119fb900e083348a Mon Sep 17 00:00:00 2001
From: ver217
Date: Thu, 7 Sep 2023 16:47:06 +0800
Subject: [PATCH] [legacy] move nn to legacy

---
 .../offload/base_offload_module.py | 2 +-
 colossalai/cli/benchmark/models.py | 2 +-
 colossalai/kernel/jit/option.py | 2 +-
 colossalai/legacy/nn/__init__.py | 4 +
 colossalai/{ => legacy}/nn/_ops/__init__.py | 0
 colossalai/{ => legacy}/nn/_ops/_utils.py | 4 +-
 colossalai/{ => legacy}/nn/_ops/addmm.py | 0
 colossalai/{ => legacy}/nn/_ops/batch_norm.py | 0
 .../{ => legacy}/nn/_ops/element_wise.py | 0
 colossalai/{ => legacy}/nn/_ops/embedding.py | 8 +-
 .../{ => legacy}/nn/_ops/embedding_bag.py | 8 +-
 colossalai/{ => legacy}/nn/_ops/layernorm.py | 5 +-
 colossalai/{ => legacy}/nn/_ops/linear.py | 0
 colossalai/{ => legacy}/nn/_ops/loss.py | 9 +-
 colossalai/{ => legacy}/nn/_ops/view.py | 0
 colossalai/legacy/nn/layer/__init__.py | 9 +
 .../{ => legacy}/nn/layer/base_layer.py | 0
 .../nn/layer/colossalai_layer/__init__.py | 14 +-
 .../nn/layer/colossalai_layer/_utils.py | 0
 .../nn/layer/colossalai_layer/dropout.py | 0
 .../nn/layer/colossalai_layer/embedding.py | 303 +++++++++---------
 .../nn/layer/colossalai_layer/linear.py | 2 +-
 .../layer/colossalai_layer/normalization.py | 83 ++---
 .../legacy/nn/layer/parallel_1d/__init__.py | 17 +
 .../nn/layer/parallel_1d/_operation.py | 0
 .../nn/layer/parallel_1d/_utils.py | 3 +-
 .../nn/layer/parallel_1d/layers.py | 0
 .../nn/layer/parallel_2d/__init__.py | 11 +-
 .../nn/layer/parallel_2d/_operation.py | 0
 .../nn/layer/parallel_2d/_utils.py | 0
 .../nn/layer/parallel_2d/layers.py | 0
 .../nn/layer/parallel_2p5d/__init__.py | 11 +-
 .../nn/layer/parallel_2p5d/_operation.py | 0
 .../nn/layer/parallel_2p5d/_utils.py | 0
 .../nn/layer/parallel_2p5d/layers.py | 0
 .../nn/layer/parallel_3d/__init__.py | 11 +-
 .../nn/layer/parallel_3d/_operation.py | 0
 .../nn/layer/parallel_3d/_utils.py | 0
 .../nn/layer/parallel_3d/layers.py | 2 +-
 .../nn/layer/parallel_sequence/__init__.py | 2 +-
 .../nn/layer/parallel_sequence/_operation.py | 2 +-
 .../nn/layer/parallel_sequence/_utils.py | 0
 .../nn/layer/parallel_sequence/layers.py | 2 +-
 colossalai/legacy/nn/layer/utils/__init__.py | 15 +
 .../{ => legacy}/nn/layer/utils/common.py | 3 +-
 .../{ => legacy}/nn/layer/vanilla/__init__.py | 0
 .../{ => legacy}/nn/layer/vanilla/layers.py | 0
 .../{ => legacy}/nn/layer/wrapper/__init__.py | 0
 .../nn/layer/wrapper/pipeline_wrapper.py | 6 +-
 colossalai/legacy/nn/loss/__init__.py | 41 +++
 colossalai/{ => legacy}/nn/loss/loss_1d.py | 0
 colossalai/{ => legacy}/nn/loss/loss_2d.py | 4 +-
 colossalai/{ => legacy}/nn/loss/loss_2p5d.py | 4 +-
 colossalai/{ => legacy}/nn/loss/loss_3d.py | 4 +-
 colossalai/{ => legacy}/nn/metric/__init__.py | 54 ++--
 colossalai/{ => legacy}/nn/metric/_utils.py | 14 +-
 .../{ => legacy}/nn/metric/accuracy_2d.py | 3 +-
 .../{ => legacy}/nn/metric/accuracy_2p5d.py | 3 +-
 .../{ => legacy}/nn/metric/accuracy_3d.py | 68 ++--
 .../{ => legacy}/nn/parallel/__init__.py | 0
 .../{ => legacy}/nn/parallel/data_parallel.py | 0
 .../nn/parallel/layers/__init__.py | 17 +-
 .../layers/cache_embedding/__init__.py | 4 +-
 .../layers/cache_embedding/base_embedding.py | 1 +
 .../layers/cache_embedding/cache_mgr.py | 20 +-
 .../cache_embedding/cached_embedding.py | 11 +-
 .../parallel/layers/cache_embedding/copyer.py | 4 +-
 .../cache_embedding/embedding_config.py | 0
 .../parallel_cached_embedding.py | 9 +-
 .../parallel_cached_embedding_tablewise.py | 13 +-
 ..._cached_embedding_tablewise_split_cache.py | 14 +-
 .../nn/parallel/layers/colo_module.py | 5 +-
 .../nn/parallel/layers/embedding.py | 3 +-
 .../{ => legacy}/nn/parallel/layers/linear.py | 3 +-
 .../nn/parallel/layers/module_utils.py | 8 +-
 .../{ => legacy}/nn/parallel/reducer.py | 0
 colossalai/nn/__init__.py | 2 -
 colossalai/nn/layer/__init__.py | 8 -
 colossalai/nn/layer/parallel_1d/__init__.py | 7 -
 colossalai/nn/layer/utils.py | 14 +
 colossalai/nn/layer/utils/__init__.py | 7 -
 colossalai/nn/loss/__init__.py | 40 ---
 colossalai/pipeline/pipelinable.py | 25 +-
 colossalai/pipeline/utils.py | 11 +-
 colossalai/tensor/dist_spec_mgr.py | 1 -
 colossalai/utils/__init__.py | 4 +
 colossalai/utils/common.py | 19 ++
 colossalai/zero/gemini/colo_init_context.py | 2 +-
 colossalai/zero/gemini/gemini_ddp.py | 8 +-
 .../memory_tracer/runtime_mem_tracer.py | 2 +-
 ...parallelize_your_training_like_Megatron.md | 2 +-
 .../train_gpt_using_hybrid_parallelism.md | 2 +-
 .../train_vit_with_hybrid_parallelism.md | 2 +-
 docs/source/en/basics/engine_trainer.md | 2 +-
 ...parallelize_your_training_like_Megatron.md | 2 +-
 .../train_gpt_using_hybrid_parallelism.md | 2 +-
 .../train_vit_with_hybrid_parallelism.md | 2 +-
 docs/source/zh-Hans/basics/engine_trainer.md | 2 +-
 examples/language/gpt/titans/model/embed.py | 8 +-
 examples/language/gpt/titans/model/gpt1d.py | 6 +-
 .../gpt/titans/model/pipeline_gpt1d.py | 2 +-
 examples/tutorial/hybrid_parallel/train.py | 2 +-
 .../tutorial/sequence_parallel/model/bert.py | 60 ++--
 .../model/layers/bert_layer.py | 24 +-
 .../components_to_test/hanging_param_model.py | 2 +-
 tests/components_to_test/inline_op_model.py | 2 +-
 tests/components_to_test/nested_model.py | 2 +-
 .../repeated_computed_layers.py | 2 +-
 tests/components_to_test/simple_net.py | 2 +-
 .../test_layers/test_1d/checks_1d/__init__.py | 0
 .../test_1d/checks_1d/check_layer_1d.py | 2 +-
 .../test_layers/test_1d/checks_1d/common.py | 31 +-
 .../test_layers/test_1d/test_1d.py | 0
 .../test_layers/test_2d/checks_2d/__init__.py | 0
 .../test_2d/checks_2d/check_layer_2d.py | 25 +-
 .../test_2d/checks_2d/check_operation_2d.py | 8 +-
 .../test_layers/test_2d/checks_2d/common.py | 0
 .../test_layers/test_2d/test_2d.py | 0
 .../test_2p5d/checks_2p5d/__init__.py | 0
 .../test_2p5d/checks_2p5d/check_layer_2p5d.py | 25 +-
 .../checks_2p5d/check_operation_2p5d.py | 7 +-
 .../test_2p5d/checks_2p5d/common.py | 2 +-
 .../test_layers/test_2p5d/test_2p5d.py | 0
 .../test_layers/test_3d/checks_3d/__init__.py | 0
 .../test_3d/checks_3d/check_layer_3d.py | 6 +-
 .../test_layers/test_3d/checks_3d/common.py | 2 +-
 .../test_layers/test_3d/test_3d.py | 0
 .../test_layers/test_cache_embedding.py | 2 +-
 .../test_sequence/checks_seq/__init__.py | 0
 .../checks_seq/check_layer_seq.py | 2 +-
 .../test_sequence/test_sequence.py | 5 +-
 .../test_checkpoint/test_checkpoint_1d.py | 2 +-
 .../test_checkpoint/test_checkpoint_2d.py | 2 +-
 .../test_checkpoint/test_checkpoint_2p5d.py | 2 +-
 .../test_checkpoint/test_checkpoint_3d.py | 2 +-
 135 files changed, 697 insertions(+), 553 deletions(-)
 create mode 100644 colossalai/legacy/nn/__init__.py
 rename colossalai/{ => legacy}/nn/_ops/__init__.py (100%)
 rename colossalai/{ => legacy}/nn/_ops/_utils.py (99%)
 rename colossalai/{ => legacy}/nn/_ops/addmm.py (100%)
 rename colossalai/{ => legacy}/nn/_ops/batch_norm.py (100%)
 rename colossalai/{ => legacy}/nn/_ops/element_wise.py (100%)
 rename colossalai/{ => legacy}/nn/_ops/embedding.py (98%)
 rename colossalai/{ => legacy}/nn/_ops/embedding_bag.py (97%)
 rename colossalai/{ => legacy}/nn/_ops/layernorm.py (92%)
 rename colossalai/{ => legacy}/nn/_ops/linear.py (100%)
 rename colossalai/{ => legacy}/nn/_ops/loss.py (96%)
 rename colossalai/{ => legacy}/nn/_ops/view.py (100%)
 create mode 100644 colossalai/legacy/nn/layer/__init__.py
 rename colossalai/{ => legacy}/nn/layer/base_layer.py (100%)
 rename colossalai/{ => legacy}/nn/layer/colossalai_layer/__init__.py (97%)
 rename colossalai/{ => legacy}/nn/layer/colossalai_layer/_utils.py (100%)
 rename colossalai/{ => legacy}/nn/layer/colossalai_layer/dropout.py (100%)
 rename colossalai/{ => legacy}/nn/layer/colossalai_layer/embedding.py (97%)
 rename colossalai/{ => legacy}/nn/layer/colossalai_layer/linear.py (99%)
 rename colossalai/{ => legacy}/nn/layer/colossalai_layer/normalization.py (97%)
 create mode 100644 colossalai/legacy/nn/layer/parallel_1d/__init__.py
 rename colossalai/{ => legacy}/nn/layer/parallel_1d/_operation.py (100%)
 rename colossalai/{ => legacy}/nn/layer/parallel_1d/_utils.py (99%)
 rename colossalai/{ => legacy}/nn/layer/parallel_1d/layers.py (100%)
 rename colossalai/{ => legacy}/nn/layer/parallel_2d/__init__.py (59%)
 rename colossalai/{ => legacy}/nn/layer/parallel_2d/_operation.py (100%)
 rename colossalai/{ => legacy}/nn/layer/parallel_2d/_utils.py (100%)
 rename colossalai/{ => legacy}/nn/layer/parallel_2d/layers.py (100%)
 rename colossalai/{ => legacy}/nn/layer/parallel_2p5d/__init__.py (59%)
 rename colossalai/{ => legacy}/nn/layer/parallel_2p5d/_operation.py (100%)
 rename colossalai/{ => legacy}/nn/layer/parallel_2p5d/_utils.py (100%)
 rename colossalai/{ => legacy}/nn/layer/parallel_2p5d/layers.py (100%)
 rename colossalai/{ => legacy}/nn/layer/parallel_3d/__init__.py (62%)
 rename colossalai/{ => legacy}/nn/layer/parallel_3d/_operation.py (100%)
 rename colossalai/{ => legacy}/nn/layer/parallel_3d/_utils.py (100%)
 rename colossalai/{ => legacy}/nn/layer/parallel_3d/layers.py (99%)
 rename colossalai/{ => legacy}/nn/layer/parallel_sequence/__init__.py (74%)
 rename colossalai/{ => legacy}/nn/layer/parallel_sequence/_operation.py (98%)
 rename colossalai/{ => legacy}/nn/layer/parallel_sequence/_utils.py (100%)
 rename colossalai/{ => legacy}/nn/layer/parallel_sequence/layers.py (99%)
 create mode 100644 colossalai/legacy/nn/layer/utils/__init__.py
 rename colossalai/{ => legacy}/nn/layer/utils/common.py (99%)
 rename colossalai/{ => legacy}/nn/layer/vanilla/__init__.py (100%)
 rename colossalai/{ => legacy}/nn/layer/vanilla/layers.py (100%)
 rename colossalai/{ => legacy}/nn/layer/wrapper/__init__.py (100%)
 rename colossalai/{ => legacy}/nn/layer/wrapper/pipeline_wrapper.py (99%)
 create mode 100644 colossalai/legacy/nn/loss/__init__.py
 rename colossalai/{ => legacy}/nn/loss/loss_1d.py (100%)
 rename colossalai/{ => legacy}/nn/loss/loss_2d.py (97%)
 rename colossalai/{ => legacy}/nn/loss/loss_2p5d.py (96%)
 rename colossalai/{ => legacy}/nn/loss/loss_3d.py (97%)
 rename colossalai/{ => legacy}/nn/metric/__init__.py (87%)
 rename colossalai/{ => legacy}/nn/metric/_utils.py (95%)
 rename colossalai/{ => legacy}/nn/metric/accuracy_2d.py (89%)
 rename colossalai/{ => legacy}/nn/metric/accuracy_2p5d.py (88%)
 rename colossalai/{ => legacy}/nn/metric/accuracy_3d.py (85%)
 rename colossalai/{ => legacy}/nn/parallel/__init__.py (100%)
 rename colossalai/{ => legacy}/nn/parallel/data_parallel.py (100%)
 rename colossalai/{ => legacy}/nn/parallel/layers/__init__.py (56%)
 rename colossalai/{ => legacy}/nn/parallel/layers/cache_embedding/__init__.py (100%)
 rename colossalai/{ => legacy}/nn/parallel/layers/cache_embedding/base_embedding.py (99%)
 rename colossalai/{ => legacy}/nn/parallel/layers/cache_embedding/cache_mgr.py (99%)
 rename colossalai/{ => legacy}/nn/parallel/layers/cache_embedding/cached_embedding.py (98%)
 rename colossalai/{ => legacy}/nn/parallel/layers/cache_embedding/copyer.py (97%)
 rename colossalai/{ => legacy}/nn/parallel/layers/cache_embedding/embedding_config.py (100%)
 rename colossalai/{ => legacy}/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py (96%)
 rename colossalai/{ => legacy}/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py (99%)
 rename colossalai/{ => legacy}/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py (99%)
 rename colossalai/{ => legacy}/nn/parallel/layers/colo_module.py (98%)
 rename colossalai/{ => legacy}/nn/parallel/layers/embedding.py (92%)
 rename colossalai/{ => legacy}/nn/parallel/layers/linear.py (93%)
 rename colossalai/{ => legacy}/nn/parallel/layers/module_utils.py (99%)
 rename colossalai/{ => legacy}/nn/parallel/reducer.py (100%)
 delete mode 100644 colossalai/nn/layer/parallel_1d/__init__.py
 create mode 100644 colossalai/nn/layer/utils.py
 delete mode 100644 colossalai/nn/layer/utils/__init__.py
 rename tests/{ => test_legacy}/test_layers/test_1d/checks_1d/__init__.py (100%)
 rename tests/{ => test_legacy}/test_layers/test_1d/checks_1d/check_layer_1d.py (99%)
 rename tests/{ => test_legacy}/test_layers/test_1d/checks_1d/common.py (94%)
 rename tests/{ => test_legacy}/test_layers/test_1d/test_1d.py (100%)
 rename tests/{ => test_legacy}/test_layers/test_2d/checks_2d/__init__.py (100%)
 rename tests/{ => test_legacy}/test_layers/test_2d/checks_2d/check_layer_2d.py (97%)
 rename tests/{ => test_legacy}/test_layers/test_2d/checks_2d/check_operation_2d.py (96%)
 rename tests/{ => test_legacy}/test_layers/test_2d/checks_2d/common.py (100%)
 rename tests/{ => test_legacy}/test_layers/test_2d/test_2d.py (100%)
 rename tests/{ => test_legacy}/test_layers/test_2p5d/checks_2p5d/__init__.py (100%)
 rename tests/{ => test_legacy}/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py (98%)
 rename tests/{ => test_legacy}/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py (97%)
 rename tests/{ => test_legacy}/test_layers/test_2p5d/checks_2p5d/common.py (75%)
 rename tests/{ => test_legacy}/test_layers/test_2p5d/test_2p5d.py (100%)
 rename tests/{ => test_legacy}/test_layers/test_3d/checks_3d/__init__.py (100%)
 rename tests/{ => test_legacy}/test_layers/test_3d/checks_3d/check_layer_3d.py (99%)
 rename tests/{ => test_legacy}/test_layers/test_3d/checks_3d/common.py (95%)
 rename tests/{ => test_legacy}/test_layers/test_3d/test_3d.py (100%)
 rename tests/{ => test_legacy}/test_layers/test_cache_embedding.py (99%)
 rename tests/{ => test_legacy}/test_layers/test_sequence/checks_seq/__init__.py (100%)
 rename tests/{ => test_legacy}/test_layers/test_sequence/checks_seq/check_layer_seq.py (91%)
 rename tests/{ => test_legacy}/test_layers/test_sequence/test_sequence.py (97%)

diff --git a/colossalai/auto_parallel/offload/base_offload_module.py b/colossalai/auto_parallel/offload/base_offload_module.py
index d0c328e134ff..5b9f74b132f3 100644
--- a/colossalai/auto_parallel/offload/base_offload_module.py
+++ b/colossalai/auto_parallel/offload/base_offload_module.py
@@ -4,7 +4,7 @@
 import torch
 import torch.nn as nn
 
-from colossalai.nn.parallel.data_parallel import _cast_float
+from colossalai.utils import _cast_float
 from colossalai.zero.legacy.gemini.tensor_utils import free_storage
 
 from .region_manager import RegionManager
diff --git a/colossalai/cli/benchmark/models.py b/colossalai/cli/benchmark/models.py
index
f8fd1c41a059..385b485b6016 100644 --- a/colossalai/cli/benchmark/models.py +++ b/colossalai/cli/benchmark/models.py @@ -1,6 +1,6 @@ import torch -import colossalai.nn as col_nn +import colossalai.legacy.nn as col_nn class MLP(torch.nn.Module): diff --git a/colossalai/kernel/jit/option.py b/colossalai/kernel/jit/option.py index e20c08b051ed..8eb4e0c880a0 100644 --- a/colossalai/kernel/jit/option.py +++ b/colossalai/kernel/jit/option.py @@ -1,6 +1,6 @@ import torch -from colossalai.nn.layer.colossalai_layer import Embedding, Linear +from colossalai.legacy.nn.layer.colossalai_layer import Embedding, Linear from colossalai.utils import get_current_device from .bias_dropout_add import bias_dropout_add_fused_train diff --git a/colossalai/legacy/nn/__init__.py b/colossalai/legacy/nn/__init__.py new file mode 100644 index 000000000000..500162901905 --- /dev/null +++ b/colossalai/legacy/nn/__init__.py @@ -0,0 +1,4 @@ +from ._ops import * +from .layer import * +from .loss import * +from .metric import * diff --git a/colossalai/nn/_ops/__init__.py b/colossalai/legacy/nn/_ops/__init__.py similarity index 100% rename from colossalai/nn/_ops/__init__.py rename to colossalai/legacy/nn/_ops/__init__.py diff --git a/colossalai/nn/_ops/_utils.py b/colossalai/legacy/nn/_ops/_utils.py similarity index 99% rename from colossalai/nn/_ops/_utils.py rename to colossalai/legacy/nn/_ops/_utils.py index 24877bbb552f..131c2154771b 100644 --- a/colossalai/nn/_ops/_utils.py +++ b/colossalai/legacy/nn/_ops/_utils.py @@ -4,7 +4,7 @@ import torch.distributed as dist from colossalai.global_variables import tensor_parallel_env as env -from colossalai.nn.layer.utils import divide +from colossalai.legacy.nn.layer.utils import divide from colossalai.tensor import ColoTensor, ColoTensorSpec, ProcessGroup GeneralTensor = Union[ColoTensor, torch.Tensor] @@ -232,7 +232,7 @@ def dual_all_to_all(x, pg, scatter_dim: int, gather_dim: int): return _DualAllToAll.apply(x, pg, scatter_dim, gather_dim) -### table wise embedding shard +# table wise embedding shard def _all_to_all_for_tablewise(x: torch.Tensor, diff --git a/colossalai/nn/_ops/addmm.py b/colossalai/legacy/nn/_ops/addmm.py similarity index 100% rename from colossalai/nn/_ops/addmm.py rename to colossalai/legacy/nn/_ops/addmm.py diff --git a/colossalai/nn/_ops/batch_norm.py b/colossalai/legacy/nn/_ops/batch_norm.py similarity index 100% rename from colossalai/nn/_ops/batch_norm.py rename to colossalai/legacy/nn/_ops/batch_norm.py diff --git a/colossalai/nn/_ops/element_wise.py b/colossalai/legacy/nn/_ops/element_wise.py similarity index 100% rename from colossalai/nn/_ops/element_wise.py rename to colossalai/legacy/nn/_ops/element_wise.py diff --git a/colossalai/nn/_ops/embedding.py b/colossalai/legacy/nn/_ops/embedding.py similarity index 98% rename from colossalai/nn/_ops/embedding.py rename to colossalai/legacy/nn/_ops/embedding.py index a045f305b5dc..b145d1763380 100644 --- a/colossalai/nn/_ops/embedding.py +++ b/colossalai/legacy/nn/_ops/embedding.py @@ -1,8 +1,10 @@ -import torch.nn.functional as F from typing import Optional + +import torch.nn.functional as F + +from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec from colossalai.tensor.op_wrapper import colo_op_impl -from colossalai.tensor import ComputePattern, ColoTensorSpec, ComputePattern, ComputeSpec, ColoTensor, ShardSpec, \ - ReplicaSpec + from ._utils import GeneralTensor, convert_to_colo_tensor, reduce_input diff --git 
a/colossalai/nn/_ops/embedding_bag.py b/colossalai/legacy/nn/_ops/embedding_bag.py similarity index 97% rename from colossalai/nn/_ops/embedding_bag.py rename to colossalai/legacy/nn/_ops/embedding_bag.py index 0026f579b6dc..9a656d5871a3 100644 --- a/colossalai/nn/_ops/embedding_bag.py +++ b/colossalai/legacy/nn/_ops/embedding_bag.py @@ -1,9 +1,11 @@ -import torch.nn.functional as F from typing import Optional + +import torch.nn.functional as F from torch import Tensor + +from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec, distspec from colossalai.tensor.op_wrapper import colo_op_impl -from colossalai.tensor import ComputePattern, ComputePattern, ComputeSpec, ColoTensor, distspec, ColoTensorSpec, \ - ShardSpec, ReplicaSpec + from ._utils import GeneralTensor, convert_to_colo_tensor diff --git a/colossalai/nn/_ops/layernorm.py b/colossalai/legacy/nn/_ops/layernorm.py similarity index 92% rename from colossalai/nn/_ops/layernorm.py rename to colossalai/legacy/nn/_ops/layernorm.py index 2b761b84e3ee..9960c5d48096 100644 --- a/colossalai/nn/_ops/layernorm.py +++ b/colossalai/legacy/nn/_ops/layernorm.py @@ -1,7 +1,10 @@ from typing import List, Optional + import torch.nn.functional as F + +from colossalai.tensor import ColoTensor, ColoTensorSpec, ReplicaSpec, distspec from colossalai.tensor.op_wrapper import colo_op_impl -from colossalai.tensor import ColoTensor, distspec, ColoTensorSpec, ReplicaSpec + from ._utils import GeneralTensor, convert_to_colo_tensor diff --git a/colossalai/nn/_ops/linear.py b/colossalai/legacy/nn/_ops/linear.py similarity index 100% rename from colossalai/nn/_ops/linear.py rename to colossalai/legacy/nn/_ops/linear.py diff --git a/colossalai/nn/_ops/loss.py b/colossalai/legacy/nn/_ops/loss.py similarity index 96% rename from colossalai/nn/_ops/loss.py rename to colossalai/legacy/nn/_ops/loss.py index 1e54f662859c..90efbfa36f2a 100644 --- a/colossalai/nn/_ops/loss.py +++ b/colossalai/legacy/nn/_ops/loss.py @@ -1,9 +1,12 @@ +from typing import Optional + import torch import torch.nn.functional as F -from typing import Optional -from colossalai.tensor.op_wrapper import colo_op_impl + +from colossalai.legacy.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D from colossalai.tensor import ColoTensor, ColoTensorSpec -from colossalai.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D +from colossalai.tensor.op_wrapper import colo_op_impl + from ._utils import GeneralTensor, convert_to_colo_tensor diff --git a/colossalai/nn/_ops/view.py b/colossalai/legacy/nn/_ops/view.py similarity index 100% rename from colossalai/nn/_ops/view.py rename to colossalai/legacy/nn/_ops/view.py diff --git a/colossalai/legacy/nn/layer/__init__.py b/colossalai/legacy/nn/layer/__init__.py new file mode 100644 index 000000000000..86961dd933a7 --- /dev/null +++ b/colossalai/legacy/nn/layer/__init__.py @@ -0,0 +1,9 @@ +from .colossalai_layer import * +from .parallel_1d import * +from .parallel_2d import * +from .parallel_2p5d import * +from .parallel_3d import * +from .parallel_sequence import * +from .utils import * +from .vanilla import * +from .wrapper import * diff --git a/colossalai/nn/layer/base_layer.py b/colossalai/legacy/nn/layer/base_layer.py similarity index 100% rename from colossalai/nn/layer/base_layer.py rename to colossalai/legacy/nn/layer/base_layer.py diff --git a/colossalai/nn/layer/colossalai_layer/__init__.py b/colossalai/legacy/nn/layer/colossalai_layer/__init__.py similarity index 97% rename from 
colossalai/nn/layer/colossalai_layer/__init__.py rename to colossalai/legacy/nn/layer/colossalai_layer/__init__.py index 2ae1b07a75b2..ed743820ddbc 100644 --- a/colossalai/nn/layer/colossalai_layer/__init__.py +++ b/colossalai/legacy/nn/layer/colossalai_layer/__init__.py @@ -1,7 +1,7 @@ -from ._utils import partition_batch -from .dropout import Dropout -from .embedding import Embedding, PatchEmbedding -from .linear import Classifier, Linear -from .normalization import LayerNorm - -__all__ = ['Linear', 'Classifier', 'Embedding', 'PatchEmbedding', 'LayerNorm', 'Dropout', 'partition_batch'] +from ._utils import partition_batch +from .dropout import Dropout +from .embedding import Embedding, PatchEmbedding +from .linear import Classifier, Linear +from .normalization import LayerNorm + +__all__ = ['Linear', 'Classifier', 'Embedding', 'PatchEmbedding', 'LayerNorm', 'Dropout', 'partition_batch'] diff --git a/colossalai/nn/layer/colossalai_layer/_utils.py b/colossalai/legacy/nn/layer/colossalai_layer/_utils.py similarity index 100% rename from colossalai/nn/layer/colossalai_layer/_utils.py rename to colossalai/legacy/nn/layer/colossalai_layer/_utils.py diff --git a/colossalai/nn/layer/colossalai_layer/dropout.py b/colossalai/legacy/nn/layer/colossalai_layer/dropout.py similarity index 100% rename from colossalai/nn/layer/colossalai_layer/dropout.py rename to colossalai/legacy/nn/layer/colossalai_layer/dropout.py diff --git a/colossalai/nn/layer/colossalai_layer/embedding.py b/colossalai/legacy/nn/layer/colossalai_layer/embedding.py similarity index 97% rename from colossalai/nn/layer/colossalai_layer/embedding.py rename to colossalai/legacy/nn/layer/colossalai_layer/embedding.py index e5c9c46e0ff1..28bcb7ffefb0 100644 --- a/colossalai/nn/layer/colossalai_layer/embedding.py +++ b/colossalai/legacy/nn/layer/colossalai_layer/embedding.py @@ -1,151 +1,152 @@ -import math -from typing import Callable - -from colossalai.utils import get_current_device -from torch import dtype, nn - -from ... import init as init -from ..parallel_1d import Embedding1D, PatchEmbedding1D, VocabParallelEmbedding1D -from ..parallel_2d import Embedding2D, PatchEmbedding2D, VocabParallelEmbedding2D -from ..parallel_2p5d import Embedding2p5D, PatchEmbedding2p5D, VocabParallelEmbedding2p5D -from ..parallel_3d import Embedding3D, PatchEmbedding3D, VocabParallelEmbedding3D -from ..utils import get_tensor_parallel_mode -from ..vanilla import VanillaPatchEmbedding -from ._utils import ColossalaiModule - -_parallel_embedding = { - '1d': Embedding1D, - '2d': Embedding2D, - '2.5d': Embedding2p5D, - '3d': Embedding3D, -} - -_vocab_parallel_embedding = { - '1d': VocabParallelEmbedding1D, - '2d': VocabParallelEmbedding2D, - '2.5d': VocabParallelEmbedding2p5D, - '3d': VocabParallelEmbedding3D -} - -_parallel_patchembedding = { - None: VanillaPatchEmbedding, - '1d': PatchEmbedding1D, - '2d': PatchEmbedding2D, - '2.5d': PatchEmbedding2p5D, - '3d': PatchEmbedding3D -} - - -class Embedding(ColossalaiModule): - r"""Embedding for colossalai. - - Args: - num_embeddings (int): number of embeddings. - embedding_dim (int): dimension of embedding. - padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient; - therefore, the embedding vector at padding_idx is not updated during training, - i.e. it remains as a fixed “pad”, defaults to None. - dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None. 
- weight_initializer (:class:`typing.Callable`, optional): - he initializer of weight, defaults to normal initializer. - - The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain: - :: - - max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is - renormalized to have norm max_norm. Note: this will modify weight in-place. - norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2. - scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse - of frequency of the words in the mini-batch. Default False. - sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False. - - More details about ``args`` and ``kwargs`` could be found in - `Embedding `_. - - More details about ``initializer`` please refer to - `init `_ - """ - - def __init__(self, - num_embeddings: int, - embedding_dim: int, - padding_idx: int = None, - dtype: dtype = None, - weight_initializer: Callable = init.normal_(), - vocab_parallel_limit: int = 2048, - *args, - **kwargs) -> None: - tensor_parallel = get_tensor_parallel_mode() - if tensor_parallel is None: - embed = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx, *args, - **kwargs).to(dtype).to(get_current_device()) - weight_initializer(embed.weight, fan_in=num_embeddings, fan_out=embedding_dim) - elif num_embeddings <= vocab_parallel_limit: - embed = _parallel_embedding[tensor_parallel]( - num_embeddings, - embedding_dim, - padding_idx=padding_idx, - dtype=dtype, - weight_initializer=weight_initializer, - *args, - **kwargs, - ) - else: - embed = _vocab_parallel_embedding[tensor_parallel]( - num_embeddings, - embedding_dim, - padding_idx=padding_idx, - dtype=dtype, - weight_initializer=weight_initializer, - *args, - **kwargs, - ) - super().__init__(embed) - - -class PatchEmbedding(ColossalaiModule): - """2D Image to Patch Embedding. - - Args: - img_size (int): image size. - patch_size (int): patch size. - in_chans (int): number of channels of input image. - embed_size (int): size of embedding. - dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None. - flatten (bool, optional): whether to flatten output tensor, defaults to True. - weight_initializer (:class:`typing.Callable`, optional): - The initializer of weight, defaults to kaiming uniform initializer. - bias_initializer (:class:`typing.Callable`, optional): - The initializer of bias, defaults to xavier uniform initializer. - position_embed_initializer (:class:`typing.Callable`, optional): - The initializer of position embedding, defaults to zeros initializer. - - More details about ``initializer`` please refer to - `init `_. 
- """ - - def __init__( - self, - img_size: int, - patch_size: int, - in_chans: int, - embed_size: int, - dtype: dtype = None, - flatten: bool = True, - weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)), - bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1), - position_embed_initializer: Callable = init.zeros_() - ) -> None: - tensor_parallel = get_tensor_parallel_mode() - embed = _parallel_patchembedding[tensor_parallel]( - img_size, - patch_size, - in_chans, - embed_size, - dtype=dtype, - flatten=flatten, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - position_embed_initializer=position_embed_initializer, - ) - super().__init__(embed) +import math +from typing import Callable + +from torch import dtype, nn + +from colossalai.nn import init +from colossalai.utils import get_current_device + +from ..parallel_1d import Embedding1D, PatchEmbedding1D, VocabParallelEmbedding1D +from ..parallel_2d import Embedding2D, PatchEmbedding2D, VocabParallelEmbedding2D +from ..parallel_2p5d import Embedding2p5D, PatchEmbedding2p5D, VocabParallelEmbedding2p5D +from ..parallel_3d import Embedding3D, PatchEmbedding3D, VocabParallelEmbedding3D +from ..utils import get_tensor_parallel_mode +from ..vanilla import VanillaPatchEmbedding +from ._utils import ColossalaiModule + +_parallel_embedding = { + '1d': Embedding1D, + '2d': Embedding2D, + '2.5d': Embedding2p5D, + '3d': Embedding3D, +} + +_vocab_parallel_embedding = { + '1d': VocabParallelEmbedding1D, + '2d': VocabParallelEmbedding2D, + '2.5d': VocabParallelEmbedding2p5D, + '3d': VocabParallelEmbedding3D +} + +_parallel_patchembedding = { + None: VanillaPatchEmbedding, + '1d': PatchEmbedding1D, + '2d': PatchEmbedding2D, + '2.5d': PatchEmbedding2p5D, + '3d': PatchEmbedding3D +} + + +class Embedding(ColossalaiModule): + r"""Embedding for colossalai. + + Args: + num_embeddings (int): number of embeddings. + embedding_dim (int): dimension of embedding. + padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient; + therefore, the embedding vector at padding_idx is not updated during training, + i.e. it remains as a fixed “pad”, defaults to None. + dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None. + weight_initializer (:class:`typing.Callable`, optional): + he initializer of weight, defaults to normal initializer. + + The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain: + :: + + max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is + renormalized to have norm max_norm. Note: this will modify weight in-place. + norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2. + scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse + of frequency of the words in the mini-batch. Default False. + sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False. + + More details about ``args`` and ``kwargs`` could be found in + `Embedding `_. 
+ + More details about ``initializer`` please refer to + `init `_ + """ + + def __init__(self, + num_embeddings: int, + embedding_dim: int, + padding_idx: int = None, + dtype: dtype = None, + weight_initializer: Callable = init.normal_(), + vocab_parallel_limit: int = 2048, + *args, + **kwargs) -> None: + tensor_parallel = get_tensor_parallel_mode() + if tensor_parallel is None: + embed = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx, *args, + **kwargs).to(dtype).to(get_current_device()) + weight_initializer(embed.weight, fan_in=num_embeddings, fan_out=embedding_dim) + elif num_embeddings <= vocab_parallel_limit: + embed = _parallel_embedding[tensor_parallel]( + num_embeddings, + embedding_dim, + padding_idx=padding_idx, + dtype=dtype, + weight_initializer=weight_initializer, + *args, + **kwargs, + ) + else: + embed = _vocab_parallel_embedding[tensor_parallel]( + num_embeddings, + embedding_dim, + padding_idx=padding_idx, + dtype=dtype, + weight_initializer=weight_initializer, + *args, + **kwargs, + ) + super().__init__(embed) + + +class PatchEmbedding(ColossalaiModule): + """2D Image to Patch Embedding. + + Args: + img_size (int): image size. + patch_size (int): patch size. + in_chans (int): number of channels of input image. + embed_size (int): size of embedding. + dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None. + flatten (bool, optional): whether to flatten output tensor, defaults to True. + weight_initializer (:class:`typing.Callable`, optional): + The initializer of weight, defaults to kaiming uniform initializer. + bias_initializer (:class:`typing.Callable`, optional): + The initializer of bias, defaults to xavier uniform initializer. + position_embed_initializer (:class:`typing.Callable`, optional): + The initializer of position embedding, defaults to zeros initializer. + + More details about ``initializer`` please refer to + `init `_. + """ + + def __init__( + self, + img_size: int, + patch_size: int, + in_chans: int, + embed_size: int, + dtype: dtype = None, + flatten: bool = True, + weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)), + bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1), + position_embed_initializer: Callable = init.zeros_() + ) -> None: + tensor_parallel = get_tensor_parallel_mode() + embed = _parallel_patchembedding[tensor_parallel]( + img_size, + patch_size, + in_chans, + embed_size, + dtype=dtype, + flatten=flatten, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + position_embed_initializer=position_embed_initializer, + ) + super().__init__(embed) diff --git a/colossalai/nn/layer/colossalai_layer/linear.py b/colossalai/legacy/nn/layer/colossalai_layer/linear.py similarity index 99% rename from colossalai/nn/layer/colossalai_layer/linear.py rename to colossalai/legacy/nn/layer/colossalai_layer/linear.py index 3e0c6e285c1c..c05ceb66ce25 100644 --- a/colossalai/nn/layer/colossalai_layer/linear.py +++ b/colossalai/legacy/nn/layer/colossalai_layer/linear.py @@ -4,9 +4,9 @@ from torch import dtype, nn +from colossalai.nn import init from colossalai.utils import get_current_device -from ... 
import init as init from ..parallel_1d import * from ..parallel_2d import * from ..parallel_2p5d import * diff --git a/colossalai/nn/layer/colossalai_layer/normalization.py b/colossalai/legacy/nn/layer/colossalai_layer/normalization.py similarity index 97% rename from colossalai/nn/layer/colossalai_layer/normalization.py rename to colossalai/legacy/nn/layer/colossalai_layer/normalization.py index 86861d30214a..f8e317e723f1 100644 --- a/colossalai/nn/layer/colossalai_layer/normalization.py +++ b/colossalai/legacy/nn/layer/colossalai_layer/normalization.py @@ -1,41 +1,42 @@ -from colossalai.utils import get_current_device -from torch import nn - -from ..parallel_1d import LayerNorm1D -from ..parallel_2d import LayerNorm2D -from ..parallel_2p5d import LayerNorm2p5D -from ..parallel_3d import LayerNorm3D -from ..utils import get_tensor_parallel_mode -from ..vanilla import VanillaLayerNorm -from ._utils import ColossalaiModule - -_parallel_layernorm = { - None: VanillaLayerNorm, - "1d": LayerNorm1D, - "2d": LayerNorm2D, - "2.5d": LayerNorm2p5D, - "3d": LayerNorm3D, -} - - -class LayerNorm(ColossalaiModule): - r"""Layer Normalization for colossalai. - - Args: - normalized_shape (int): input shape from an expected input of size. - :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] - \times \ldots \times \text{normalized_shape}[-1]]` - If a single integer is used, it is treated as a singleton list, and this module will - normalize over the last dimension which is expected to be of that specific size. - eps (float): a value added to the denominator for numerical stability, defaults to 1e-05. - bias (bool, optional): Whether to add a bias, defaults to ``True``. - dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None. - """ - - def __init__(self, normalized_shape: int, eps=1e-05, bias=True, dtype=None) -> None: - tensor_parallel = get_tensor_parallel_mode() - if tensor_parallel is None: - norm = nn.LayerNorm(normalized_shape, eps=eps).to(dtype).to(get_current_device()) - else: - norm = _parallel_layernorm[tensor_parallel](normalized_shape, eps=eps, dtype=dtype) - super().__init__(norm) +from torch import nn + +from colossalai.utils import get_current_device + +from ..parallel_1d import LayerNorm1D +from ..parallel_2d import LayerNorm2D +from ..parallel_2p5d import LayerNorm2p5D +from ..parallel_3d import LayerNorm3D +from ..utils import get_tensor_parallel_mode +from ..vanilla import VanillaLayerNorm +from ._utils import ColossalaiModule + +_parallel_layernorm = { + None: VanillaLayerNorm, + "1d": LayerNorm1D, + "2d": LayerNorm2D, + "2.5d": LayerNorm2p5D, + "3d": LayerNorm3D, +} + + +class LayerNorm(ColossalaiModule): + r"""Layer Normalization for colossalai. + + Args: + normalized_shape (int): input shape from an expected input of size. + :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] + \times \ldots \times \text{normalized_shape}[-1]]` + If a single integer is used, it is treated as a singleton list, and this module will + normalize over the last dimension which is expected to be of that specific size. + eps (float): a value added to the denominator for numerical stability, defaults to 1e-05. + bias (bool, optional): Whether to add a bias, defaults to ``True``. + dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None. 
+ """ + + def __init__(self, normalized_shape: int, eps=1e-05, bias=True, dtype=None) -> None: + tensor_parallel = get_tensor_parallel_mode() + if tensor_parallel is None: + norm = nn.LayerNorm(normalized_shape, eps=eps).to(dtype).to(get_current_device()) + else: + norm = _parallel_layernorm[tensor_parallel](normalized_shape, eps=eps, dtype=dtype) + super().__init__(norm) diff --git a/colossalai/legacy/nn/layer/parallel_1d/__init__.py b/colossalai/legacy/nn/layer/parallel_1d/__init__.py new file mode 100644 index 000000000000..9cffd4d339f5 --- /dev/null +++ b/colossalai/legacy/nn/layer/parallel_1d/__init__.py @@ -0,0 +1,17 @@ +from .layers import ( + Classifier1D, + Dropout1D, + Embedding1D, + LayerNorm1D, + Linear1D, + Linear1D_Col, + Linear1D_Row, + PatchEmbedding1D, + VocabParallelClassifier1D, + VocabParallelEmbedding1D, +) + +__all__ = [ + 'Linear1D', 'Linear1D_Col', 'Linear1D_Row', 'Embedding1D', 'Dropout1D', 'Classifier1D', 'VocabParallelClassifier1D', + 'VocabParallelEmbedding1D', 'LayerNorm1D', 'PatchEmbedding1D' +] diff --git a/colossalai/nn/layer/parallel_1d/_operation.py b/colossalai/legacy/nn/layer/parallel_1d/_operation.py similarity index 100% rename from colossalai/nn/layer/parallel_1d/_operation.py rename to colossalai/legacy/nn/layer/parallel_1d/_operation.py diff --git a/colossalai/nn/layer/parallel_1d/_utils.py b/colossalai/legacy/nn/layer/parallel_1d/_utils.py similarity index 99% rename from colossalai/nn/layer/parallel_1d/_utils.py rename to colossalai/legacy/nn/layer/parallel_1d/_utils.py index 1212d595635d..fddf4e73db51 100644 --- a/colossalai/nn/layer/parallel_1d/_utils.py +++ b/colossalai/legacy/nn/layer/parallel_1d/_utils.py @@ -3,6 +3,7 @@ import torch import torch.distributed as dist + from colossalai.core import global_context as gpc from colossalai.global_variables import tensor_parallel_env as env @@ -124,7 +125,7 @@ def backward(ctx, grad_output): class _SplitForwardGatherBackward(torch.autograd.Function): """ Split the input and keep only the corresponding chuck to the rank. - + Args: input_: input matrix. parallel_mode: parallel mode. 
diff --git a/colossalai/nn/layer/parallel_1d/layers.py b/colossalai/legacy/nn/layer/parallel_1d/layers.py similarity index 100% rename from colossalai/nn/layer/parallel_1d/layers.py rename to colossalai/legacy/nn/layer/parallel_1d/layers.py diff --git a/colossalai/nn/layer/parallel_2d/__init__.py b/colossalai/legacy/nn/layer/parallel_2d/__init__.py similarity index 59% rename from colossalai/nn/layer/parallel_2d/__init__.py rename to colossalai/legacy/nn/layer/parallel_2d/__init__.py index 5562d1a70036..9c65f3608710 100644 --- a/colossalai/nn/layer/parallel_2d/__init__.py +++ b/colossalai/legacy/nn/layer/parallel_2d/__init__.py @@ -1,6 +1,13 @@ from ._operation import reduce_by_batch_2d, split_batch_2d -from .layers import (Classifier2D, Embedding2D, LayerNorm2D, Linear2D, PatchEmbedding2D, VocabParallelClassifier2D, - VocabParallelEmbedding2D) +from .layers import ( + Classifier2D, + Embedding2D, + LayerNorm2D, + Linear2D, + PatchEmbedding2D, + VocabParallelClassifier2D, + VocabParallelEmbedding2D, +) __all__ = [ 'split_batch_2d', 'reduce_by_batch_2d', 'Linear2D', 'LayerNorm2D', 'Classifier2D', 'PatchEmbedding2D', diff --git a/colossalai/nn/layer/parallel_2d/_operation.py b/colossalai/legacy/nn/layer/parallel_2d/_operation.py similarity index 100% rename from colossalai/nn/layer/parallel_2d/_operation.py rename to colossalai/legacy/nn/layer/parallel_2d/_operation.py diff --git a/colossalai/nn/layer/parallel_2d/_utils.py b/colossalai/legacy/nn/layer/parallel_2d/_utils.py similarity index 100% rename from colossalai/nn/layer/parallel_2d/_utils.py rename to colossalai/legacy/nn/layer/parallel_2d/_utils.py diff --git a/colossalai/nn/layer/parallel_2d/layers.py b/colossalai/legacy/nn/layer/parallel_2d/layers.py similarity index 100% rename from colossalai/nn/layer/parallel_2d/layers.py rename to colossalai/legacy/nn/layer/parallel_2d/layers.py diff --git a/colossalai/nn/layer/parallel_2p5d/__init__.py b/colossalai/legacy/nn/layer/parallel_2p5d/__init__.py similarity index 59% rename from colossalai/nn/layer/parallel_2p5d/__init__.py rename to colossalai/legacy/nn/layer/parallel_2p5d/__init__.py index bec3b1c4b0b8..23e47e6ed06b 100644 --- a/colossalai/nn/layer/parallel_2p5d/__init__.py +++ b/colossalai/legacy/nn/layer/parallel_2p5d/__init__.py @@ -1,6 +1,13 @@ from ._operation import reduce_by_batch_2p5d, split_batch_2p5d -from .layers import (Classifier2p5D, Embedding2p5D, LayerNorm2p5D, Linear2p5D, PatchEmbedding2p5D, - VocabParallelClassifier2p5D, VocabParallelEmbedding2p5D) +from .layers import ( + Classifier2p5D, + Embedding2p5D, + LayerNorm2p5D, + Linear2p5D, + PatchEmbedding2p5D, + VocabParallelClassifier2p5D, + VocabParallelEmbedding2p5D, +) __all__ = [ 'split_batch_2p5d', 'reduce_by_batch_2p5d', 'Linear2p5D', 'LayerNorm2p5D', 'Classifier2p5D', 'PatchEmbedding2p5D', diff --git a/colossalai/nn/layer/parallel_2p5d/_operation.py b/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py similarity index 100% rename from colossalai/nn/layer/parallel_2p5d/_operation.py rename to colossalai/legacy/nn/layer/parallel_2p5d/_operation.py diff --git a/colossalai/nn/layer/parallel_2p5d/_utils.py b/colossalai/legacy/nn/layer/parallel_2p5d/_utils.py similarity index 100% rename from colossalai/nn/layer/parallel_2p5d/_utils.py rename to colossalai/legacy/nn/layer/parallel_2p5d/_utils.py diff --git a/colossalai/nn/layer/parallel_2p5d/layers.py b/colossalai/legacy/nn/layer/parallel_2p5d/layers.py similarity index 100% rename from colossalai/nn/layer/parallel_2p5d/layers.py rename to 
colossalai/legacy/nn/layer/parallel_2p5d/layers.py diff --git a/colossalai/nn/layer/parallel_3d/__init__.py b/colossalai/legacy/nn/layer/parallel_3d/__init__.py similarity index 62% rename from colossalai/nn/layer/parallel_3d/__init__.py rename to colossalai/legacy/nn/layer/parallel_3d/__init__.py index 9ae255b449ee..17fe8403c585 100644 --- a/colossalai/nn/layer/parallel_3d/__init__.py +++ b/colossalai/legacy/nn/layer/parallel_3d/__init__.py @@ -1,6 +1,13 @@ from ._operation import reduce_by_batch_3d, split_batch_3d, split_tensor_3d -from .layers import (Classifier3D, Embedding3D, LayerNorm3D, Linear3D, PatchEmbedding3D, VocabParallelClassifier3D, - VocabParallelEmbedding3D) +from .layers import ( + Classifier3D, + Embedding3D, + LayerNorm3D, + Linear3D, + PatchEmbedding3D, + VocabParallelClassifier3D, + VocabParallelEmbedding3D, +) __all__ = [ 'reduce_by_batch_3d', 'split_tensor_3d', 'split_batch_3d', 'Linear3D', 'LayerNorm3D', 'PatchEmbedding3D', diff --git a/colossalai/nn/layer/parallel_3d/_operation.py b/colossalai/legacy/nn/layer/parallel_3d/_operation.py similarity index 100% rename from colossalai/nn/layer/parallel_3d/_operation.py rename to colossalai/legacy/nn/layer/parallel_3d/_operation.py diff --git a/colossalai/nn/layer/parallel_3d/_utils.py b/colossalai/legacy/nn/layer/parallel_3d/_utils.py similarity index 100% rename from colossalai/nn/layer/parallel_3d/_utils.py rename to colossalai/legacy/nn/layer/parallel_3d/_utils.py diff --git a/colossalai/nn/layer/parallel_3d/layers.py b/colossalai/legacy/nn/layer/parallel_3d/layers.py similarity index 99% rename from colossalai/nn/layer/parallel_3d/layers.py rename to colossalai/legacy/nn/layer/parallel_3d/layers.py index 2861b53013e1..b815a842ca52 100644 --- a/colossalai/nn/layer/parallel_3d/layers.py +++ b/colossalai/legacy/nn/layer/parallel_3d/layers.py @@ -13,9 +13,9 @@ from colossalai.core import global_context as gpc from colossalai.global_variables import tensor_parallel_env as env from colossalai.legacy.communication import all_reduce, broadcast +from colossalai.legacy.nn.layer.base_layer import ParallelLayer from colossalai.legacy.registry import LAYERS from colossalai.nn import init as init -from colossalai.nn.layer.base_layer import ParallelLayer from colossalai.utils.checkpointing import ( broadcast_state_dict, gather_tensor_parallel_state_dict, diff --git a/colossalai/nn/layer/parallel_sequence/__init__.py b/colossalai/legacy/nn/layer/parallel_sequence/__init__.py similarity index 74% rename from colossalai/nn/layer/parallel_sequence/__init__.py rename to colossalai/legacy/nn/layer/parallel_sequence/__init__.py index 4fa9eed6f34b..d92d66d40a8e 100644 --- a/colossalai/nn/layer/parallel_sequence/__init__.py +++ b/colossalai/legacy/nn/layer/parallel_sequence/__init__.py @@ -1,4 +1,4 @@ -from ._operation import RingQK, RingAV +from ._operation import RingAV, RingQK from .layers import TransformerSelfAttentionRing __all__ = ['TransformerSelfAttentionRing', 'RingAV', 'RingQK'] diff --git a/colossalai/nn/layer/parallel_sequence/_operation.py b/colossalai/legacy/nn/layer/parallel_sequence/_operation.py similarity index 98% rename from colossalai/nn/layer/parallel_sequence/_operation.py rename to colossalai/legacy/nn/layer/parallel_sequence/_operation.py index d03102527caa..fcf2962017a3 100644 --- a/colossalai/nn/layer/parallel_sequence/_operation.py +++ b/colossalai/legacy/nn/layer/parallel_sequence/_operation.py @@ -8,7 +8,7 @@ from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context 
as gpc from colossalai.legacy.communication import ring_forward -from colossalai.nn.layer.parallel_sequence._utils import _calc_current_device_range, _calc_incoming_device_range +from colossalai.legacy.nn.layer.parallel_sequence._utils import _calc_current_device_range, _calc_incoming_device_range from colossalai.utils import get_current_device diff --git a/colossalai/nn/layer/parallel_sequence/_utils.py b/colossalai/legacy/nn/layer/parallel_sequence/_utils.py similarity index 100% rename from colossalai/nn/layer/parallel_sequence/_utils.py rename to colossalai/legacy/nn/layer/parallel_sequence/_utils.py diff --git a/colossalai/nn/layer/parallel_sequence/layers.py b/colossalai/legacy/nn/layer/parallel_sequence/layers.py similarity index 99% rename from colossalai/nn/layer/parallel_sequence/layers.py rename to colossalai/legacy/nn/layer/parallel_sequence/layers.py index 4d0ff2e0605b..e44e61c2fb7d 100644 --- a/colossalai/nn/layer/parallel_sequence/layers.py +++ b/colossalai/legacy/nn/layer/parallel_sequence/layers.py @@ -14,8 +14,8 @@ from colossalai.core import global_context as gpc from colossalai.kernel import FusedScaleMaskSoftmax from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType +from colossalai.legacy.nn.layer.parallel_sequence._operation import RingAV, RingQK from colossalai.legacy.registry import LAYERS -from colossalai.nn.layer.parallel_sequence._operation import RingAV, RingQK @LAYERS.register_module diff --git a/colossalai/legacy/nn/layer/utils/__init__.py b/colossalai/legacy/nn/layer/utils/__init__.py new file mode 100644 index 000000000000..56e969bfd0bd --- /dev/null +++ b/colossalai/legacy/nn/layer/utils/__init__.py @@ -0,0 +1,15 @@ +from .common import ( + ACT2FN, + CheckpointModule, + _ntuple, + divide, + get_tensor_parallel_mode, + set_tensor_parallel_attribute_by_partition, + set_tensor_parallel_attribute_by_size, + to_2tuple, +) + +__all__ = [ + 'CheckpointModule', 'divide', 'ACT2FN', 'set_tensor_parallel_attribute_by_size', + 'set_tensor_parallel_attribute_by_partition', 'get_tensor_parallel_mode', '_ntuple', 'to_2tuple' +] diff --git a/colossalai/nn/layer/utils/common.py b/colossalai/legacy/nn/layer/utils/common.py similarity index 99% rename from colossalai/nn/layer/utils/common.py rename to colossalai/legacy/nn/layer/utils/common.py index f2297304fdc9..d8f3ad2a7eca 100644 --- a/colossalai/nn/layer/utils/common.py +++ b/colossalai/legacy/nn/layer/utils/common.py @@ -6,10 +6,11 @@ import numpy as np import torch +from torch import Tensor, nn + from colossalai.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS from colossalai.global_variables import tensor_parallel_env as env from colossalai.utils import checkpoint -from torch import Tensor, nn class CheckpointModule(nn.Module): diff --git a/colossalai/nn/layer/vanilla/__init__.py b/colossalai/legacy/nn/layer/vanilla/__init__.py similarity index 100% rename from colossalai/nn/layer/vanilla/__init__.py rename to colossalai/legacy/nn/layer/vanilla/__init__.py diff --git a/colossalai/nn/layer/vanilla/layers.py b/colossalai/legacy/nn/layer/vanilla/layers.py similarity index 100% rename from colossalai/nn/layer/vanilla/layers.py rename to colossalai/legacy/nn/layer/vanilla/layers.py diff --git a/colossalai/nn/layer/wrapper/__init__.py b/colossalai/legacy/nn/layer/wrapper/__init__.py similarity index 100% rename from colossalai/nn/layer/wrapper/__init__.py rename to colossalai/legacy/nn/layer/wrapper/__init__.py diff --git a/colossalai/nn/layer/wrapper/pipeline_wrapper.py 
b/colossalai/legacy/nn/layer/wrapper/pipeline_wrapper.py similarity index 99% rename from colossalai/nn/layer/wrapper/pipeline_wrapper.py rename to colossalai/legacy/nn/layer/wrapper/pipeline_wrapper.py index ef1d794cc68f..68fea8622c5c 100644 --- a/colossalai/nn/layer/wrapper/pipeline_wrapper.py +++ b/colossalai/legacy/nn/layer/wrapper/pipeline_wrapper.py @@ -1,6 +1,8 @@ -import torch.nn as nn -import torch.distributed as dist from typing import List, Tuple, Union + +import torch.distributed as dist +import torch.nn as nn + from colossalai.context import ParallelMode from colossalai.core import global_context as gpc diff --git a/colossalai/legacy/nn/loss/__init__.py b/colossalai/legacy/nn/loss/__init__.py new file mode 100644 index 000000000000..1bd8872d9c3a --- /dev/null +++ b/colossalai/legacy/nn/loss/__init__.py @@ -0,0 +1,41 @@ +from torch import nn +from torch.nn.modules.loss import * +from torch.nn.modules.loss import _Loss + +from colossalai.global_variables import tensor_parallel_env as env +from colossalai.legacy.nn.layer.utils import get_tensor_parallel_mode + +from .loss_1d import VocabParallelCrossEntropyLoss1D +from .loss_2d import CrossEntropyLoss2D, VocabParallelCrossEntropyLoss2D +from .loss_2p5d import CrossEntropyLoss2p5D, VocabParallelCrossEntropyLoss2p5D +from .loss_3d import CrossEntropyLoss3D, VocabParallelCrossEntropyLoss3D + +_parallel_cross_entropy = { + '2d': CrossEntropyLoss2D, + '2.5d': CrossEntropyLoss2p5D, + '3d': CrossEntropyLoss3D, +} + +_vocab_parallel_cross_entropy = { + '1d': VocabParallelCrossEntropyLoss1D, + '2d': VocabParallelCrossEntropyLoss2D, + '2.5d': VocabParallelCrossEntropyLoss2p5D, + '3d': VocabParallelCrossEntropyLoss3D, +} + + +class CrossEntropyLoss(_Loss): + + def __init__(self, reduction: bool = True, *args, **kwargs): + super().__init__() + tensor_parallel = get_tensor_parallel_mode() + if tensor_parallel is not None and env.vocab_parallel: + self.loss = _vocab_parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs) + elif tensor_parallel is None or tensor_parallel == '1d': + reduction = 'mean' if reduction else 'none' + self.loss = nn.CrossEntropyLoss(reduction=reduction, *args, **kwargs) + else: + self.loss = _parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs) + + def forward(self, *args): + return self.loss(*args) diff --git a/colossalai/nn/loss/loss_1d.py b/colossalai/legacy/nn/loss/loss_1d.py similarity index 100% rename from colossalai/nn/loss/loss_1d.py rename to colossalai/legacy/nn/loss/loss_1d.py diff --git a/colossalai/nn/loss/loss_2d.py b/colossalai/legacy/nn/loss/loss_2d.py similarity index 97% rename from colossalai/nn/loss/loss_2d.py rename to colossalai/legacy/nn/loss/loss_2d.py index 6db40c0f3a04..6191602b71ee 100644 --- a/colossalai/nn/loss/loss_2d.py +++ b/colossalai/legacy/nn/loss/loss_2d.py @@ -6,9 +6,9 @@ from colossalai.context import ParallelMode from colossalai.core import global_context as gpc +from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d +from colossalai.legacy.nn.layer.parallel_2d._utils import assert_summa_initialization from colossalai.legacy.registry import LOSSES -from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d -from colossalai.nn.layer.parallel_2d._utils import assert_summa_initialization from colossalai.utils import get_current_device diff --git a/colossalai/nn/loss/loss_2p5d.py b/colossalai/legacy/nn/loss/loss_2p5d.py similarity index 96% rename from colossalai/nn/loss/loss_2p5d.py 
rename to colossalai/legacy/nn/loss/loss_2p5d.py index 9c78a1ef0331..2746b201152c 100644 --- a/colossalai/nn/loss/loss_2p5d.py +++ b/colossalai/legacy/nn/loss/loss_2p5d.py @@ -6,9 +6,9 @@ from colossalai.context import ParallelMode from colossalai.core import global_context as gpc +from colossalai.legacy.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d +from colossalai.legacy.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization from colossalai.legacy.registry import LOSSES -from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d -from colossalai.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization from colossalai.utils import get_current_device diff --git a/colossalai/nn/loss/loss_3d.py b/colossalai/legacy/nn/loss/loss_3d.py similarity index 97% rename from colossalai/nn/loss/loss_3d.py rename to colossalai/legacy/nn/loss/loss_3d.py index 5c0f266401d1..2aeb1bd9825d 100644 --- a/colossalai/nn/loss/loss_3d.py +++ b/colossalai/legacy/nn/loss/loss_3d.py @@ -6,9 +6,9 @@ from colossalai.constants import INPUT_GROUP_3D, OUTPUT_GROUP_3D, WEIGHT_GROUP_3D from colossalai.core import global_context as gpc +from colossalai.legacy.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d +from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env from colossalai.legacy.registry import LOSSES -from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d -from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env from colossalai.utils import get_current_device diff --git a/colossalai/nn/metric/__init__.py b/colossalai/legacy/nn/metric/__init__.py similarity index 87% rename from colossalai/nn/metric/__init__.py rename to colossalai/legacy/nn/metric/__init__.py index 00833b6119c1..76c6dac89c5b 100644 --- a/colossalai/nn/metric/__init__.py +++ b/colossalai/legacy/nn/metric/__init__.py @@ -1,26 +1,28 @@ -from torch import nn - -from ._utils import calc_acc -from .accuracy_2d import Accuracy2D -from .accuracy_2p5d import Accuracy2p5D -from .accuracy_3d import Accuracy3D -from colossalai.nn.layer.utils import get_tensor_parallel_mode - -_parallel_accuracy = { - '2d': Accuracy2D, - '2.5d': Accuracy2p5D, - '3d': Accuracy3D, -} - - -class Accuracy(nn.Module): - def __init__(self): - super().__init__() - tensor_parallel = get_tensor_parallel_mode() - if tensor_parallel not in _parallel_accuracy: - self.acc = calc_acc - else: - self.acc = _parallel_accuracy[tensor_parallel]() - - def forward(self, *args): - return self.acc(*args) +from torch import nn + +from colossalai.legacy.nn.layer.utils import get_tensor_parallel_mode + +from ._utils import calc_acc +from .accuracy_2d import Accuracy2D +from .accuracy_2p5d import Accuracy2p5D +from .accuracy_3d import Accuracy3D + +_parallel_accuracy = { + '2d': Accuracy2D, + '2.5d': Accuracy2p5D, + '3d': Accuracy3D, +} + + +class Accuracy(nn.Module): + + def __init__(self): + super().__init__() + tensor_parallel = get_tensor_parallel_mode() + if tensor_parallel not in _parallel_accuracy: + self.acc = calc_acc + else: + self.acc = _parallel_accuracy[tensor_parallel]() + + def forward(self, *args): + return self.acc(*args) diff --git a/colossalai/nn/metric/_utils.py b/colossalai/legacy/nn/metric/_utils.py similarity index 95% rename from colossalai/nn/metric/_utils.py rename to colossalai/legacy/nn/metric/_utils.py index eac591b64c65..8706ffc101b0 100644 --- a/colossalai/nn/metric/_utils.py +++ 
b/colossalai/legacy/nn/metric/_utils.py @@ -1,7 +1,7 @@ -import torch - - -def calc_acc(logits, targets): - preds = torch.argmax(logits, dim=-1) - correct = torch.sum(targets == preds) - return correct +import torch + + +def calc_acc(logits, targets): + preds = torch.argmax(logits, dim=-1) + correct = torch.sum(targets == preds) + return correct diff --git a/colossalai/nn/metric/accuracy_2d.py b/colossalai/legacy/nn/metric/accuracy_2d.py similarity index 89% rename from colossalai/nn/metric/accuracy_2d.py rename to colossalai/legacy/nn/metric/accuracy_2d.py index a86832973cfd..838c48834a96 100644 --- a/colossalai/nn/metric/accuracy_2d.py +++ b/colossalai/legacy/nn/metric/accuracy_2d.py @@ -1,7 +1,8 @@ import torch -from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d from torch import nn +from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d + from ._utils import calc_acc diff --git a/colossalai/nn/metric/accuracy_2p5d.py b/colossalai/legacy/nn/metric/accuracy_2p5d.py similarity index 88% rename from colossalai/nn/metric/accuracy_2p5d.py rename to colossalai/legacy/nn/metric/accuracy_2p5d.py index 3044da065de1..183380cd9846 100644 --- a/colossalai/nn/metric/accuracy_2p5d.py +++ b/colossalai/legacy/nn/metric/accuracy_2p5d.py @@ -1,7 +1,8 @@ import torch -from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d from torch import nn +from colossalai.legacy.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d + from ._utils import calc_acc diff --git a/colossalai/nn/metric/accuracy_3d.py b/colossalai/legacy/nn/metric/accuracy_3d.py similarity index 85% rename from colossalai/nn/metric/accuracy_3d.py rename to colossalai/legacy/nn/metric/accuracy_3d.py index 5506fc1d2ffc..1aaac73ecabd 100644 --- a/colossalai/nn/metric/accuracy_3d.py +++ b/colossalai/legacy/nn/metric/accuracy_3d.py @@ -1,33 +1,35 @@ -import torch -from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D -from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d -from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env -from torch import nn - -from ._utils import calc_acc - - -class Accuracy3D(nn.Module): - """Accuracy for 3D parallelism - """ - def __init__(self): - super().__init__() - self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D) - self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D) - - def forward(self, logits, targets): - """Calculate the accuracy of predicted labels. - - Args: - logits (:class:`torch.tensor`): Predicted labels. - targets (:class:`torch.tensor`): True labels from data. - - Returns: - float: the accuracy of prediction. 
- """ - with torch.no_grad(): - targets = split_tensor_3d(targets, 0, self.weight_parallel_mode) - targets = split_tensor_3d(targets, 0, self.input_parallel_mode) - correct = calc_acc(logits, targets) - correct = reduce_by_batch_3d(correct, self.input_parallel_mode, self.weight_parallel_mode) - return correct +import torch +from torch import nn + +from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D +from colossalai.legacy.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d +from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env + +from ._utils import calc_acc + + +class Accuracy3D(nn.Module): + """Accuracy for 3D parallelism + """ + + def __init__(self): + super().__init__() + self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D) + self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D) + + def forward(self, logits, targets): + """Calculate the accuracy of predicted labels. + + Args: + logits (:class:`torch.tensor`): Predicted labels. + targets (:class:`torch.tensor`): True labels from data. + + Returns: + float: the accuracy of prediction. + """ + with torch.no_grad(): + targets = split_tensor_3d(targets, 0, self.weight_parallel_mode) + targets = split_tensor_3d(targets, 0, self.input_parallel_mode) + correct = calc_acc(logits, targets) + correct = reduce_by_batch_3d(correct, self.input_parallel_mode, self.weight_parallel_mode) + return correct diff --git a/colossalai/nn/parallel/__init__.py b/colossalai/legacy/nn/parallel/__init__.py similarity index 100% rename from colossalai/nn/parallel/__init__.py rename to colossalai/legacy/nn/parallel/__init__.py diff --git a/colossalai/nn/parallel/data_parallel.py b/colossalai/legacy/nn/parallel/data_parallel.py similarity index 100% rename from colossalai/nn/parallel/data_parallel.py rename to colossalai/legacy/nn/parallel/data_parallel.py diff --git a/colossalai/nn/parallel/layers/__init__.py b/colossalai/legacy/nn/parallel/layers/__init__.py similarity index 56% rename from colossalai/nn/parallel/layers/__init__.py rename to colossalai/legacy/nn/parallel/layers/__init__.py index 29b8353e63c5..f38124efedf7 100644 --- a/colossalai/nn/parallel/layers/__init__.py +++ b/colossalai/legacy/nn/parallel/layers/__init__.py @@ -1,10 +1,17 @@ +from .cache_embedding import ( + CachedEmbeddingBag, + CachedParamMgr, + EvictionStrategy, + LimitBuffIndexCopyer, + ParallelCachedEmbeddingBag, + ParallelCachedEmbeddingBagTablewise, + ParallelCachedEmbeddingBagTablewiseSpiltCache, + TablewiseEmbeddingBagConfig, +) from .colo_module import ColoModule -from .linear import ColoLinear from .embedding import ColoEmbedding -from .module_utils import register_colo_module, is_colo_module, get_colo_module, init_colo_module, check_colo_module - -from .cache_embedding import CachedEmbeddingBag, ParallelCachedEmbeddingBag, CachedParamMgr, LimitBuffIndexCopyer, EvictionStrategy, \ - ParallelCachedEmbeddingBagTablewise, TablewiseEmbeddingBagConfig, ParallelCachedEmbeddingBagTablewiseSpiltCache +from .linear import ColoLinear +from .module_utils import check_colo_module, get_colo_module, init_colo_module, is_colo_module, register_colo_module __all__ = [ 'ColoModule', 'register_colo_module', 'is_colo_module', 'get_colo_module', 'init_colo_module', 'check_colo_module', diff --git a/colossalai/nn/parallel/layers/cache_embedding/__init__.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/__init__.py similarity index 100% rename from colossalai/nn/parallel/layers/cache_embedding/__init__.py 
rename to colossalai/legacy/nn/parallel/layers/cache_embedding/__init__.py index 5bbc931a79dc..d87930c1c6b3 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/__init__.py +++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/__init__.py @@ -1,8 +1,8 @@ from .cache_mgr import CachedParamMgr, EvictionStrategy -from .copyer import LimitBuffIndexCopyer from .cached_embedding import CachedEmbeddingBag -from .parallel_cached_embedding import ParallelCachedEmbeddingBag +from .copyer import LimitBuffIndexCopyer from .embedding_config import TablewiseEmbeddingBagConfig +from .parallel_cached_embedding import ParallelCachedEmbeddingBag from .parallel_cached_embedding_tablewise import ParallelCachedEmbeddingBagTablewise from .parallel_cached_embedding_tablewise_split_cache import ParallelCachedEmbeddingBagTablewiseSpiltCache diff --git a/colossalai/nn/parallel/layers/cache_embedding/base_embedding.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/base_embedding.py similarity index 99% rename from colossalai/nn/parallel/layers/cache_embedding/base_embedding.py rename to colossalai/legacy/nn/parallel/layers/cache_embedding/base_embedding.py index 705835a0ed22..9558c541e703 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/base_embedding.py +++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/base_embedding.py @@ -1,4 +1,5 @@ import abc + import torch.nn as nn diff --git a/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/cache_mgr.py similarity index 99% rename from colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py rename to colossalai/legacy/nn/parallel/layers/cache_embedding/cache_mgr.py index a6159856dcce..16530c4ce7b8 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py +++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/cache_mgr.py @@ -1,12 +1,14 @@ +import sys +from contextlib import contextmanager +from enum import Enum +from typing import List, Optional + import numpy as np import torch -from torch.profiler import record_function -from typing import List, Optional from contexttimer import Timer +from torch.profiler import record_function + from .copyer import LimitBuffIndexCopyer -from enum import Enum -import sys -from contextlib import contextmanager class EvictionStrategy(Enum): @@ -35,7 +37,7 @@ def _wait_for_data(t, stream: Optional[torch.cuda.streams.Stream]) -> None: class CachedParamMgr(torch.nn.Module): """ Manage Embedding Weights on CPU and CUDA memory uses a software cache. - CPU maintains the entire original weight. + CPU maintains the entire original weight. CUDA maintains a fraction of the weights used in the upcoming computation. The row number in CUDA is controlled by `cuda_row_num`. During training, GPU needs to transmit embedding rows between CPU and GPU. Args: @@ -115,7 +117,7 @@ def timer(self, name): self._elapsed_dict[name] += t.elapsed def _find_evict_gpu_idxs(self, evict_num: int) -> torch.Tensor: - """_find_evict_gpu_idxs + """_find_evict_gpu_idxs Find the gpu idxs to be evicted, according to their freq. Args: evict_num (int): how many rows has to be evicted @@ -202,7 +204,7 @@ def reorder(self, ids_freq_mapping: Optional[List[int]] = None, warmup_ratio=0.7 """reorder reorder the weight according to ids' frequency in dataset before training. Execute only once before training, also known as warmup phase. - + Note: If you would like to use the DATASET as the eviction strategy, you must call this function. 
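`CachedParamMgr`'s docstring above describes the software cache these moved files implement: the full embedding weight stays on CPU, a fraction of rows (bounded by `cuda_row_num`) lives on CUDA, and `reorder` warms that cache from an id-frequency mapping before training. A small self-contained sketch of the warmup idea, using hypothetical names rather than the manager's real API:

```python
import torch

def warmup_rows(cpu_weight: torch.Tensor, ids_freq: torch.Tensor, cuda_row_num: int):
    """Reorder rows so the hottest ids come first, then preload the top rows to CUDA."""
    order = torch.argsort(ids_freq, descending=True)   # most frequent ids first
    reordered = cpu_weight[order]                       # hot rows now sit at the top
    cuda_cache = reordered[:cuda_row_num].cuda()        # the GPU-resident fraction
    # map original id -> its new row position so lookups still resolve correctly
    new_pos = torch.empty_like(order)
    new_pos[order] = torch.arange(order.numel())
    return reordered, cuda_cache, new_pos
```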
Note: @@ -516,7 +518,7 @@ def _evict(self) -> int: """ deprecated evict one row from cuda to cpu. - Returns: + Returns: (int) : the slot id be evicted. """ mask = torch.logical_or(torch.isin(self.cached_idx_map, self.evict_backlist), self.cached_idx_map == -1) diff --git a/colossalai/nn/parallel/layers/cache_embedding/cached_embedding.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/cached_embedding.py similarity index 98% rename from colossalai/nn/parallel/layers/cache_embedding/cached_embedding.py rename to colossalai/legacy/nn/parallel/layers/cache_embedding/cached_embedding.py index a74cb8d94bab..bc7d178906da 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/cached_embedding.py +++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/cached_embedding.py @@ -1,10 +1,11 @@ +from typing import Iterator, List, Optional, Tuple, Union + import torch import torch.nn.functional as F -from typing import List, Optional, Iterator, Tuple, Union +from torch.nn.parameter import Parameter from .base_embedding import BaseEmbeddingBag from .cache_mgr import CachedParamMgr, EvictionStrategy -from torch.nn.parameter import Parameter class CachedEmbeddingBag(BaseEmbeddingBag): @@ -27,7 +28,7 @@ class CachedEmbeddingBag(BaseEmbeddingBag): include_last_offset (bool, optional): if True, offsets has one additional element, where the last element is equivalent to the size of indices. This matches the CSR format.. Defaults to False. dtype (torch.dtype, optional): data type of the cpu weight initialization. Defaults to None meaning float32. device (torch.device, optional): device type to the cpu weight. Defaults to None meaning cpu. - cache_ratio (float, float): cache ratio of the #cuda_weight_row / #cpu_weight_row + cache_ratio (float, float): cache ratio of the #cuda_weight_row / #cpu_weight_row ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency of each embedding vector occurs in dataset. Defaults to None. warmup_ratio (float, optional): the ratio of cuda cache is warmuped with. Defaults to 0.7. buffer_size (int, optional): the max number of vectors in transmitter buffer. If set to 0, the buffer is not used. Defaults to 0. @@ -85,10 +86,10 @@ def _preprocess(self, buffer_size=50_000, pin_weight=False): """ - Called after initialized. + Called after initialized. Reorder the weight rows according to the ids_freq_mapping. Then, let the weights of the Module be managed by a CachedParamMgr. - + Args: cuda_row_num (int): number of rows can be hosted in CUDA memory ids_freq_mapping (List[int]): a list, idx is id number, value is freq diff --git a/colossalai/nn/parallel/layers/cache_embedding/copyer.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/copyer.py similarity index 97% rename from colossalai/nn/parallel/layers/cache_embedding/copyer.py rename to colossalai/legacy/nn/parallel/layers/cache_embedding/copyer.py index aa1f794482f9..804a07f88207 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/copyer.py +++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/copyer.py @@ -3,7 +3,7 @@ class LimitBuffIndexCopyer(object): - """LimitBuffIndexCopyer + """LimitBuffIndexCopyer Index Copy using limited temp buffer on CUDA. 
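`LimitBuffIndexCopyer` moves rows between source and target tensors through a bounded staging buffer, which is what keeps CPU-CUDA traffic for the cache manageable. A rough sketch of that chunked index-copy pattern (illustrative only, with hypothetical names, and assuming the index tensors live on the same device as the tensors they index):

```python
import torch

@torch.no_grad()
def buffered_index_copy(dim, src_index, tgt_index, src, tgt, buff_size=1024):
    # process at most buff_size rows at a time through a temporary buffer
    for start in range(0, src_index.numel(), buff_size):
        idx_src = src_index[start:start + buff_size]
        idx_tgt = tgt_index[start:start + buff_size]
        tmp = src.index_select(dim, idx_src)   # stage: src[idx_src] -> tmp
        tgt.index_copy_(dim, idx_tgt, tmp)     # scatter: tmp -> tgt[idx_tgt]
```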
Args: @@ -15,7 +15,7 @@ def __init__(self, size: int) -> None: @torch.no_grad() def index_copy(self, dim: int, src_index: LongTensor, tgt_index: LongTensor, src: torch.Tensor, tgt: torch.Tensor): - """copy + """copy src tensor[src_index] -(index_select)-> tmp -(index_copy_)-> tgt tensor [tgt_index] The valid rows in the src tensor are continuous, while rows in tgt tensor is scattered. diff --git a/colossalai/nn/parallel/layers/cache_embedding/embedding_config.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/embedding_config.py similarity index 100% rename from colossalai/nn/parallel/layers/cache_embedding/embedding_config.py rename to colossalai/legacy/nn/parallel/layers/cache_embedding/embedding_config.py diff --git a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py similarity index 96% rename from colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py rename to colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py index d7f77e195f4b..79d7672b26bc 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py +++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py @@ -1,12 +1,13 @@ +from typing import Iterator, List, Optional, Tuple + import torch import torch.nn.functional as F -from typing import List, Optional, Iterator, Tuple -from .cached_embedding import CachedEmbeddingBag -from colossalai.nn._ops._utils import dual_all_to_all +from colossalai.legacy.nn._ops._utils import dual_all_to_all +from colossalai.tensor import ColoParameter, ColoTensor, ColoTensorSpec, ComputePattern, ProcessGroup, ShardSpec -from colossalai.tensor import ColoParameter, ShardSpec, ComputePattern, ProcessGroup, ColoTensorSpec, ColoTensor from .cache_mgr import CachedParamMgr, EvictionStrategy +from .cached_embedding import CachedEmbeddingBag def get_partition(embedding_dim, rank, world_size) -> Tuple[int, int, bool]: diff --git a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py similarity index 99% rename from colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py rename to colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py index 949f85ad4baf..116d836b7139 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py +++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py @@ -1,15 +1,16 @@ +import time +from typing import List + import torch import torch.distributed as dist import torch.nn.functional as F -from .cached_embedding import CachedEmbeddingBag -from .cache_mgr import EvictionStrategy -from .embedding_config import TablewiseEmbeddingBagConfig +from colossalai.legacy.nn._ops._utils import dual_all_to_all_tablewise from colossalai.tensor import ProcessGroup -from colossalai.nn._ops._utils import dual_all_to_all_tablewise -from typing import List -import time +from .cache_mgr import EvictionStrategy +from .cached_embedding import CachedEmbeddingBag +from .embedding_config import TablewiseEmbeddingBagConfig class ParallelCachedEmbeddingBagTablewise(CachedEmbeddingBag): diff --git a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py 
b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py similarity index 99% rename from colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py rename to colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py index 80a54b4fadd4..0014c784fba1 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py +++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py @@ -1,17 +1,17 @@ +import abc +from typing import List + import torch import torch.distributed as dist import torch.nn as nn from torch.profiler import record_function -from .cached_embedding import CachedEmbeddingBag - +from colossalai.legacy.nn._ops._utils import dual_all_to_all_tablewise from colossalai.tensor import ProcessGroup -from colossalai.nn._ops._utils import dual_all_to_all_tablewise -from .embedding_config import TablewiseEmbeddingBagConfig -from .cache_mgr import EvictionStrategy -from typing import List -import abc +from .cache_mgr import EvictionStrategy +from .cached_embedding import CachedEmbeddingBag +from .embedding_config import TablewiseEmbeddingBagConfig class ParallelCachedEmbeddingBagTablewiseSpiltCache(abc.ABC, nn.Module): diff --git a/colossalai/nn/parallel/layers/colo_module.py b/colossalai/legacy/nn/parallel/layers/colo_module.py similarity index 98% rename from colossalai/nn/parallel/layers/colo_module.py rename to colossalai/legacy/nn/parallel/layers/colo_module.py index 8f0f5d5f520a..a0a3eb40cf08 100644 --- a/colossalai/nn/parallel/layers/colo_module.py +++ b/colossalai/legacy/nn/parallel/layers/colo_module.py @@ -1,6 +1,7 @@ -from colossalai.tensor.distspec import _DistSpec +from typing import Dict, List + from colossalai.tensor import ComputePattern -from typing import List, Dict +from colossalai.tensor.distspec import _DistSpec class ColoModule(object): diff --git a/colossalai/nn/parallel/layers/embedding.py b/colossalai/legacy/nn/parallel/layers/embedding.py similarity index 92% rename from colossalai/nn/parallel/layers/embedding.py rename to colossalai/legacy/nn/parallel/layers/embedding.py index ccacc1ead297..3e4e7ffd8de7 100644 --- a/colossalai/nn/parallel/layers/embedding.py +++ b/colossalai/legacy/nn/parallel/layers/embedding.py @@ -1,5 +1,6 @@ +from colossalai.tensor import ComputePattern, ProcessGroup, ShardSpec, distspec + from .colo_module import ColoModule -from colossalai.tensor import ComputePattern, distspec, ProcessGroup, ShardSpec class ColoEmbedding(ColoModule): diff --git a/colossalai/nn/parallel/layers/linear.py b/colossalai/legacy/nn/parallel/layers/linear.py similarity index 93% rename from colossalai/nn/parallel/layers/linear.py rename to colossalai/legacy/nn/parallel/layers/linear.py index 84a8c042587d..e391cf808933 100644 --- a/colossalai/nn/parallel/layers/linear.py +++ b/colossalai/legacy/nn/parallel/layers/linear.py @@ -1,5 +1,6 @@ +from colossalai.tensor import ComputePattern, ProcessGroup, ShardSpec, distspec + from .colo_module import ColoModule -from colossalai.tensor import ComputePattern, distspec, ProcessGroup, ShardSpec class ColoLinear(ColoModule): diff --git a/colossalai/nn/parallel/layers/module_utils.py b/colossalai/legacy/nn/parallel/layers/module_utils.py similarity index 99% rename from colossalai/nn/parallel/layers/module_utils.py rename to colossalai/legacy/nn/parallel/layers/module_utils.py index 
38d128cc705e..191266fa70fd 100644 --- a/colossalai/nn/parallel/layers/module_utils.py +++ b/colossalai/legacy/nn/parallel/layers/module_utils.py @@ -1,9 +1,11 @@ from typing import Dict -from colossalai.tensor import ColoParameter, ComputeSpec, ProcessGroup -from colossalai.tensor import distspec -from . import ColoModule + import torch +from colossalai.tensor import ColoParameter, ComputeSpec, ProcessGroup, distspec + +from . import ColoModule + _COLOSSAL_MODULES: Dict[type, ColoModule] = {} diff --git a/colossalai/nn/parallel/reducer.py b/colossalai/legacy/nn/parallel/reducer.py similarity index 100% rename from colossalai/nn/parallel/reducer.py rename to colossalai/legacy/nn/parallel/reducer.py diff --git a/colossalai/nn/__init__.py b/colossalai/nn/__init__.py index 910ad203180c..5ea46f7dd7bd 100644 --- a/colossalai/nn/__init__.py +++ b/colossalai/nn/__init__.py @@ -1,6 +1,4 @@ -from ._ops import * from .layer import * from .loss import * from .lr_scheduler import * -from .metric import * from .optimizer import * diff --git a/colossalai/nn/layer/__init__.py b/colossalai/nn/layer/__init__.py index b705632f8040..edd986ef5e82 100644 --- a/colossalai/nn/layer/__init__.py +++ b/colossalai/nn/layer/__init__.py @@ -1,10 +1,2 @@ -from .colossalai_layer import * -from .parallel_1d import * -from .parallel_2d import * -from .parallel_2p5d import * -from .parallel_3d import * -from .parallel_sequence import * from .moe import * from .utils import * -from .vanilla import * -from .wrapper import * diff --git a/colossalai/nn/layer/parallel_1d/__init__.py b/colossalai/nn/layer/parallel_1d/__init__.py deleted file mode 100644 index 2353851df665..000000000000 --- a/colossalai/nn/layer/parallel_1d/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from .layers import (Classifier1D, Dropout1D, Embedding1D, LayerNorm1D, Linear1D, Linear1D_Col, Linear1D_Row, - PatchEmbedding1D, VocabParallelClassifier1D, VocabParallelEmbedding1D) - -__all__ = [ - 'Linear1D', 'Linear1D_Col', 'Linear1D_Row', 'Embedding1D', 'Dropout1D', 'Classifier1D', 'VocabParallelClassifier1D', - 'VocabParallelEmbedding1D', 'LayerNorm1D', 'PatchEmbedding1D' -] diff --git a/colossalai/nn/layer/utils.py b/colossalai/nn/layer/utils.py new file mode 100644 index 000000000000..dc12ff8daa4e --- /dev/null +++ b/colossalai/nn/layer/utils.py @@ -0,0 +1,14 @@ +def divide(numerator, denominator): + """Only allow exact division. + + Args: + numerator (int): Numerator of the division. + denominator (int): Denominator of the division. + + Returns: + int: the result of exact division. 
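The small `colossalai/nn/layer/utils.py` created here keeps only the `divide` helper, which asserts exact divisibility before dividing; its usual callers are partition-size computations where a remainder would indicate a configuration bug. A quick usage note (illustrative, not part of the patch):

```python
from colossalai.nn.layer.utils import divide

divide(12, 4)   # -> 3
divide(10, 4)   # raises AssertionError: 10 is not divisible by 4
```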
+ """ + assert denominator != 0, 'denominator can not be zero' + assert numerator % denominator == 0, \ + '{} is not divisible by {}'.format(numerator, denominator) + return numerator // denominator diff --git a/colossalai/nn/layer/utils/__init__.py b/colossalai/nn/layer/utils/__init__.py deleted file mode 100644 index 7e999ee82149..000000000000 --- a/colossalai/nn/layer/utils/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from .common import (ACT2FN, CheckpointModule, _ntuple, divide, get_tensor_parallel_mode, - set_tensor_parallel_attribute_by_partition, set_tensor_parallel_attribute_by_size, to_2tuple) - -__all__ = [ - 'CheckpointModule', 'divide', 'ACT2FN', 'set_tensor_parallel_attribute_by_size', - 'set_tensor_parallel_attribute_by_partition', 'get_tensor_parallel_mode', '_ntuple', 'to_2tuple' -] diff --git a/colossalai/nn/loss/__init__.py b/colossalai/nn/loss/__init__.py index 373e4ec9468b..ee2add48ab91 100644 --- a/colossalai/nn/loss/__init__.py +++ b/colossalai/nn/loss/__init__.py @@ -1,41 +1 @@ -from colossalai.global_variables import tensor_parallel_env as env -from colossalai.nn.layer.utils import get_tensor_parallel_mode -from torch import nn -from torch.nn.modules.loss import * -from torch.nn.modules.loss import _Loss - -from .loss_1d import VocabParallelCrossEntropyLoss1D -from .loss_2d import CrossEntropyLoss2D, VocabParallelCrossEntropyLoss2D -from .loss_2p5d import CrossEntropyLoss2p5D, VocabParallelCrossEntropyLoss2p5D -from .loss_3d import CrossEntropyLoss3D, VocabParallelCrossEntropyLoss3D from .loss_moe import MoeCrossEntropyLoss, MoeLoss - -_parallel_cross_entropy = { - '2d': CrossEntropyLoss2D, - '2.5d': CrossEntropyLoss2p5D, - '3d': CrossEntropyLoss3D, -} - -_vocab_parallel_cross_entropy = { - '1d': VocabParallelCrossEntropyLoss1D, - '2d': VocabParallelCrossEntropyLoss2D, - '2.5d': VocabParallelCrossEntropyLoss2p5D, - '3d': VocabParallelCrossEntropyLoss3D, -} - - -class CrossEntropyLoss(_Loss): - - def __init__(self, reduction: bool = True, *args, **kwargs): - super().__init__() - tensor_parallel = get_tensor_parallel_mode() - if tensor_parallel is not None and env.vocab_parallel: - self.loss = _vocab_parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs) - elif tensor_parallel is None or tensor_parallel == '1d': - reduction = 'mean' if reduction else 'none' - self.loss = nn.CrossEntropyLoss(reduction=reduction, *args, **kwargs) - else: - self.loss = _parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs) - - def forward(self, *args): - return self.loss(*args) diff --git a/colossalai/pipeline/pipelinable.py b/colossalai/pipeline/pipelinable.py index 79913987b7cc..ba8b1591da9d 100644 --- a/colossalai/pipeline/pipelinable.py +++ b/colossalai/pipeline/pipelinable.py @@ -1,15 +1,24 @@ -import torch import inspect -from colossalai.utils.model.utils import InsertPostInitMethodToModuleSubClasses -from .utils import partition_uniform, partition_balanced, build_kwargs_for_function, \ - build_kwargs_for_module, exec_func_with_kwargs, exec_funcs_with_kwargs, \ - call_module, customized_partition -from colossalai.nn.layer.utils import CheckpointModule -from colossalai.tensor import ColoParameter -from colossalai.core import global_context as gpc +import torch + from colossalai.context import ParallelMode +from colossalai.core import global_context as gpc +from colossalai.legacy.nn.layer.utils import CheckpointModule +from colossalai.tensor import ColoParameter +from colossalai.utils.model.utils import 
InsertPostInitMethodToModuleSubClasses + from .layer_spec import LayerSpec +from .utils import ( + build_kwargs_for_function, + build_kwargs_for_module, + call_module, + customized_partition, + exec_func_with_kwargs, + exec_funcs_with_kwargs, + partition_balanced, + partition_uniform, +) class PipelinableContext(InsertPostInitMethodToModuleSubClasses): diff --git a/colossalai/pipeline/utils.py b/colossalai/pipeline/utils.py index ac8a3ad7d1db..be8428692756 100644 --- a/colossalai/pipeline/utils.py +++ b/colossalai/pipeline/utils.py @@ -1,12 +1,13 @@ import heapq import inspect +from collections import OrderedDict +from typing import List + import torch +from colossalai.legacy.nn.layer.utils import CheckpointModule from colossalai.logging import get_dist_logger -from colossalai.nn.layer.utils import CheckpointModule -from typing import List -from collections import OrderedDict def _binary_partition(weights: List, start: int, end: int): """Returns the binary partition position of `weights`, given the start @@ -162,7 +163,7 @@ def build_kwargs_for_module(function, input_tensor, kw_dict): kwargs_offset = 1 elif isinstance(input_tensor, (tuple, OrderedDict)): #assert isinstance(input_tensor, tuple), f'input_tensor should be a torch.Tensor or a tuple object.' - # Huggingface will take their own structures based on OrderedDict as the output + # Huggingface will take their own structures based on OrderedDict as the output # between layers so we've to close this check. kwargs_offset = len(input_tensor) args_name_list = list(sig.parameters.keys()) @@ -256,7 +257,7 @@ def call_module(module, args=None, kwargs=None): def customized_partition(exec_seq): ''' - This function will analyze the exec_seq. In the exec_seq, users will use 'SPLIT_NODE' as an + This function will analyze the exec_seq. In the exec_seq, users will use 'SPLIT_NODE' as an annotation to note the partition point. ''' customized_parts = {} diff --git a/colossalai/tensor/dist_spec_mgr.py b/colossalai/tensor/dist_spec_mgr.py index c968050de49d..4740a316b7f5 100644 --- a/colossalai/tensor/dist_spec_mgr.py +++ b/colossalai/tensor/dist_spec_mgr.py @@ -2,7 +2,6 @@ import torch import torch.distributed as dist -# from colossalai.nn.layer.utils import divide from numpy import prod from colossalai.tensor.distspec import DistPlacementPattern, _DistSpec diff --git a/colossalai/utils/__init__.py b/colossalai/utils/__init__.py index 7b2e8480c66c..6f9717d353e6 100644 --- a/colossalai/utils/__init__.py +++ b/colossalai/utils/__init__.py @@ -1,12 +1,14 @@ from .activation_checkpoint import checkpoint from .checkpointing import load_checkpoint, save_checkpoint from .common import ( + _cast_float, clip_grad_norm_fp32, conditional_context, copy_tensor_parallel_attributes, count_zeros_fp32, disposable, ensure_path_exists, + free_storage, is_ddp_ignored, is_dp_rank_0, is_model_parallel_parameter, @@ -72,4 +74,6 @@ 'disposable', 'colo_set_cpu_memory_capacity', 'colo_get_cpu_memory_capacity', + '_cast_float', + 'free_storage', ] diff --git a/colossalai/utils/common.py b/colossalai/utils/common.py index 8022e84dc24b..998901708239 100644 --- a/colossalai/utils/common.py +++ b/colossalai/utils/common.py @@ -470,3 +470,22 @@ def wrapper(*args, **kwargs): return func(*args, **kwargs) return wrapper + + +def free_storage(data: torch.Tensor) -> None: + """Free underlying storage of a Tensor.""" + if data.storage().size() > 0: + # Since we're modifying the Tensor's Storage directly, make sure the Tensor + # is the sole occupant of the Storage. 
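`free_storage` and `_cast_float`, now exported from `colossalai.utils`, are small helpers Gemini relies on: the first releases a tensor's underlying storage while keeping its metadata, the second recursively casts floating-point tensors inside nested lists, tuples, and dicts. An illustrative usage sketch (not part of the patch):

```python
import torch
from colossalai.utils import _cast_float, free_storage

t = torch.empty(4, 4)
free_storage(t)                       # bytes are released...
assert t.storage().size() == 0        # ...but the tensor still reports shape (4, 4)

batch = {"x": torch.randn(2, 3), "mask": torch.ones(2, 3, dtype=torch.bool)}
half = _cast_float(batch, torch.float16)   # only floating-point tensors are cast
assert half["x"].dtype == torch.float16 and half["mask"].dtype == torch.bool
```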
+ assert data.storage_offset() == 0 + data.storage().resize_(0) + + +def _cast_float(args, dtype: torch.dtype): + if isinstance(args, torch.Tensor) and torch.is_floating_point(args): + args = args.to(dtype) + elif isinstance(args, (list, tuple)): + args = type(args)(_cast_float(t, dtype) for t in args) + elif isinstance(args, dict): + args = {k: _cast_float(v, dtype) for k, v in args.items()} + return args diff --git a/colossalai/zero/gemini/colo_init_context.py b/colossalai/zero/gemini/colo_init_context.py index 75f8576ca477..dad852a34a71 100644 --- a/colossalai/zero/gemini/colo_init_context.py +++ b/colossalai/zero/gemini/colo_init_context.py @@ -87,7 +87,7 @@ def __init__(self, self._default_dist_spec = default_dist_spec def _register_colo_modules(self): - from colossalai.nn.parallel.layers import ColoEmbedding, ColoLinear, register_colo_module + from colossalai.legacy.nn.parallel.layers import ColoEmbedding, ColoLinear, register_colo_module register_colo_module(torch.nn.Linear, ColoLinear()) register_colo_module(torch.nn.Embedding, ColoEmbedding()) diff --git a/colossalai/zero/gemini/gemini_ddp.py b/colossalai/zero/gemini/gemini_ddp.py index 741a977d1ea0..918b08cd3150 100644 --- a/colossalai/zero/gemini/gemini_ddp.py +++ b/colossalai/zero/gemini/gemini_ddp.py @@ -10,15 +10,13 @@ from torch.distributed import ProcessGroup from torch.distributed.distributed_c10d import _get_default_group -from colossalai.checkpoint_io.utils import calculate_tensor_size, StateDictSharder +from colossalai.checkpoint_io.utils import StateDictSharder, calculate_tensor_size from colossalai.interface import ModelWrapper - from colossalai.lazy import LazyTensor from colossalai.logging import get_dist_logger -from colossalai.nn.parallel.data_parallel import _cast_float, free_storage from colossalai.tensor.colo_parameter import ColoParameter from colossalai.tensor.param_op_hook import ColoParamOpHookManager -from colossalai.utils import get_current_device, is_ddp_ignored +from colossalai.utils import _cast_float, free_storage, get_current_device, is_ddp_ignored from .chunk import Chunk, ChunkManager, TensorState, init_chunk_manager from .gemini_hook import GeminiZeROHook @@ -780,5 +778,3 @@ def state_dict_shard(self, yield block, block_size yield sharder.current_block, sharder.current_block_size - - diff --git a/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py b/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py index 0c9eac8b63e3..e5466965cc48 100644 --- a/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py +++ b/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py @@ -1,7 +1,7 @@ import torch.nn -from colossalai.nn.parallel.data_parallel import _cast_float from colossalai.tensor.param_op_hook import ColoParamOpHookManager +from colossalai.utils import _cast_float from colossalai.zero.legacy.gemini.ophooks.runtime_mem_tracer_hook import ( GradMemStats, GradMemTracerHook, diff --git a/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md b/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md index 281fd47554ca..0a94a7f5d691 100644 --- a/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md +++ b/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md @@ -176,7 +176,7 @@ In our latest example, a Gemini + ZeRO DDP model is also defined to reduce overh ```python def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"): - from colossalai.nn.parallel import 
GeminiDDP + from colossalai.zero import GeminiDDP model = GeminiDDP(model, device=get_current_device(), placement_policy=placement_policy, diff --git a/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md b/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md index 5aa806c64322..36c94fb492cd 100644 --- a/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md +++ b/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md @@ -42,7 +42,7 @@ from colossalai.core import global_context as gpc from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule, PipelineSchedule) from colossalai.logging import disable_existing_loggers, get_dist_logger -from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper +from colossalai.legacy.nn.layer.wrapper import PipelineSharedModuleWrapper from colossalai.legacy.trainer import Trainer, hooks from colossalai.utils.timer import MultiTimer from model_zoo.gpt import GPTLMLoss diff --git a/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md index 22022639ce12..0ec9d5c3c5de 100644 --- a/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md +++ b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md @@ -78,7 +78,7 @@ from colossalai.context import ParallelMode from colossalai.core import global_context as gpc from colossalai.logging import disable_existing_loggers, get_dist_logger from colossalai.nn.lr_scheduler import LinearWarmupLR -from colossalai.nn.metric import Accuracy +from colossalai.legacy.nn.metric import Accuracy from colossalai.legacy.trainer import Trainer, hooks ``` diff --git a/docs/source/en/basics/engine_trainer.md b/docs/source/en/basics/engine_trainer.md index 6d2355ad9044..e17c37e24a55 100644 --- a/docs/source/en/basics/engine_trainer.md +++ b/docs/source/en/basics/engine_trainer.md @@ -344,7 +344,7 @@ for epoch in range(gpc.config.NUM_EPOCHS): If you wish to train with a trainer object, you can follow the code snippet below: ```python -from colossalai.nn.metric import Accuracy +from colossalai.legacy.nn.metric import Accuracy from colossalai.legacy.trainer import Trainer, hooks diff --git a/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md b/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md index 3f85d50454ae..dfd1e2910b4e 100644 --- a/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md +++ b/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md @@ -160,7 +160,7 @@ for mn, module in model.named_modules(): ```python def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"): - from colossalai.nn.parallel import GeminiDDP + from colossalai.zero import GeminiDDP model = GeminiDDP(model, device=get_current_device(), placement_policy=placement_policy, diff --git a/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md b/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md index 9cfbf58731b8..3f57f39f2838 100644 --- a/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md +++ b/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md @@ -42,7 +42,7 @@ from colossalai.core import global_context as gpc from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule, PipelineSchedule) 
from colossalai.logging import disable_existing_loggers, get_dist_logger -from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper +from colossalai.legacy.nn.layer.wrapper import PipelineSharedModuleWrapper from colossalai.legacy.trainer import Trainer, hooks from colossalai.utils.timer import MultiTimer from model_zoo.gpt import GPTLMLoss diff --git a/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md index 803882a5ad2e..f7dd8d477a66 100644 --- a/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md +++ b/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md @@ -73,7 +73,7 @@ from colossalai.context import ParallelMode from colossalai.core import global_context as gpc from colossalai.logging import disable_existing_loggers, get_dist_logger from colossalai.nn.lr_scheduler import LinearWarmupLR -from colossalai.nn.metric import Accuracy +from colossalai.legacy.nn.metric import Accuracy from colossalai.legacy.trainer import Trainer, hooks ``` diff --git a/docs/source/zh-Hans/basics/engine_trainer.md b/docs/source/zh-Hans/basics/engine_trainer.md index e57220292c98..ed5100299212 100644 --- a/docs/source/zh-Hans/basics/engine_trainer.md +++ b/docs/source/zh-Hans/basics/engine_trainer.md @@ -340,7 +340,7 @@ for epoch in range(gpc.config.NUM_EPOCHS): ```python -from colossalai.nn.metric import Accuracy +from colossalai.legacy.nn.metric import Accuracy from colossalai.legacy.trainer import Trainer, hooks diff --git a/examples/language/gpt/titans/model/embed.py b/examples/language/gpt/titans/model/embed.py index 668992901239..e521193a97da 100644 --- a/examples/language/gpt/titans/model/embed.py +++ b/examples/language/gpt/titans/model/embed.py @@ -8,11 +8,11 @@ from colossalai.context import ParallelMode, seed from colossalai.core import global_context as gpc +from colossalai.legacy.nn.layer.base_layer import ParallelLayer +from colossalai.legacy.nn.layer.parallel_1d._utils import gather_forward_split_backward, reduce_grad, reduce_input +from colossalai.legacy.nn.layer.parallel_1d.layers import Linear1D_Row +from colossalai.legacy.nn.layer.utils import divide from colossalai.legacy.registry import LAYERS, LOSSES, MODELS -from colossalai.nn.layer.base_layer import ParallelLayer -from colossalai.nn.layer.parallel_1d._utils import gather_forward_split_backward, reduce_grad, reduce_input -from colossalai.nn.layer.parallel_1d.layers import Linear1D_Row -from colossalai.nn.layer.utils import divide from colossalai.utils import get_current_device diff --git a/examples/language/gpt/titans/model/gpt1d.py b/examples/language/gpt/titans/model/gpt1d.py index 2edd03606b7d..72297c540da1 100644 --- a/examples/language/gpt/titans/model/gpt1d.py +++ b/examples/language/gpt/titans/model/gpt1d.py @@ -11,9 +11,9 @@ from colossalai import nn as col_nn from colossalai.core import global_context as gpc from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType -from colossalai.nn.layer import Linear1D_Col, Linear1D_Row -from colossalai.nn.layer.base_layer import ParallelLayer -from colossalai.nn.layer.utils import ACT2FN, divide +from colossalai.legacy.nn.layer import Linear1D_Col, Linear1D_Row +from colossalai.legacy.nn.layer.base_layer import ParallelLayer +from colossalai.legacy.nn.layer.utils import ACT2FN, divide from colossalai.utils import checkpoint from colossalai.utils.activation_checkpoint import checkpoint diff --git 
a/examples/language/gpt/titans/model/pipeline_gpt1d.py b/examples/language/gpt/titans/model/pipeline_gpt1d.py index 30180285bc70..9b22d156bbcd 100644 --- a/examples/language/gpt/titans/model/pipeline_gpt1d.py +++ b/examples/language/gpt/titans/model/pipeline_gpt1d.py @@ -9,8 +9,8 @@ from colossalai import nn as col_nn from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc +from colossalai.legacy.nn.layer.wrapper import PipelineSharedModuleWrapper from colossalai.logging import get_dist_logger -from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper from colossalai.pipeline.utils import partition_uniform from .embed import HiddenParallelEmbedding, HiddenParallelGPTLMHead1D, VocabParallelEmbedding, VocabParallelGPTLMHead1D diff --git a/examples/tutorial/hybrid_parallel/train.py b/examples/tutorial/hybrid_parallel/train.py index 4953d5350f31..12cdec902400 100644 --- a/examples/tutorial/hybrid_parallel/train.py +++ b/examples/tutorial/hybrid_parallel/train.py @@ -7,8 +7,8 @@ import colossalai from colossalai.context import ParallelMode from colossalai.core import global_context as gpc +from colossalai.legacy.nn import CrossEntropyLoss from colossalai.logging import get_dist_logger -from colossalai.nn import CrossEntropyLoss from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR from colossalai.pipeline.pipelinable import PipelinableContext from colossalai.utils import is_using_pp diff --git a/examples/tutorial/sequence_parallel/model/bert.py b/examples/tutorial/sequence_parallel/model/bert.py index 049579c5a639..b8adb501f95e 100644 --- a/examples/tutorial/sequence_parallel/model/bert.py +++ b/examples/tutorial/sequence_parallel/model/bert.py @@ -1,33 +1,37 @@ -from colossalai.context.parallel_mode import ParallelMode +import inspect + import torch import torch.nn as nn -import inspect -from .layers import Embedding, BertLayer, BertDualHead, PreProcessor, VocabEmbedding -from .layers.init_method import init_normal, output_init_normal -from colossalai.core import global_context as gpc + from colossalai.context import ParallelMode +from colossalai.context.parallel_mode import ParallelMode +from colossalai.core import global_context as gpc from colossalai.kernel import LayerNorm -from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper +from colossalai.legacy.nn.layer.wrapper import PipelineSharedModuleWrapper from colossalai.logging import get_dist_logger from colossalai.pipeline.utils import partition_uniform +from .layers import BertDualHead, BertLayer, Embedding, PreProcessor, VocabEmbedding +from .layers.init_method import init_normal, output_init_normal + class BertForPretrain(nn.Module): - def __init__(self, - vocab_size, - hidden_size, - max_sequence_length, - num_attention_heads, - num_layers, - add_binary_head, - is_naive_fp16, - num_tokentypes=2, - dropout_prob=0.1, - mlp_ratio=4, - init_std=0.02, - convert_fp16_to_fp32_in_softmax=False, - ): + def __init__( + self, + vocab_size, + hidden_size, + max_sequence_length, + num_attention_heads, + num_layers, + add_binary_head, + is_naive_fp16, + num_tokentypes=2, + dropout_prob=0.1, + mlp_ratio=4, + init_std=0.02, + convert_fp16_to_fp32_in_softmax=False, + ): super().__init__() self.seq_parallel_size = gpc.get_world_size(ParallelMode.SEQUENCE) assert max_sequence_length % self.seq_parallel_size == 0, 'sequence length is not divisible by the sequence parallel size' @@ -47,19 +51,19 @@ def __init__(self, self.bert_layers = nn.ModuleList() for i in 
range(num_layers): - bert_layer = BertLayer(layer_number=i+1, + bert_layer = BertLayer(layer_number=i + 1, hidden_size=hidden_size, num_attention_heads=num_attention_heads, attention_dropout=dropout_prob, mlp_ratio=mlp_ratio, hidden_dropout=dropout_prob, convert_fp16_to_fp32_in_softmax=convert_fp16_to_fp32_in_softmax, - is_naive_fp16=is_naive_fp16 - ) + is_naive_fp16=is_naive_fp16) self.bert_layers.append(bert_layer) self.layer_norm = LayerNorm(hidden_size) - self.head = BertDualHead(hidden_size, self.embedding.word_embedding_weight.size(0), + self.head = BertDualHead(hidden_size, + self.embedding.word_embedding_weight.size(0), add_binary_head=add_binary_head) self.reset_parameters() @@ -166,22 +170,20 @@ def __init__(self, end_idx = num_layers for i in range(start_idx, end_idx): - bert_layer = BertLayer(layer_number=i+1, + bert_layer = BertLayer(layer_number=i + 1, hidden_size=hidden_size, num_attention_heads=num_attention_heads, attention_dropout=dropout_prob, mlp_ratio=mlp_ratio, hidden_dropout=dropout_prob, convert_fp16_to_fp32_in_softmax=convert_fp16_to_fp32_in_softmax, - is_naive_fp16=is_naive_fp16 - ) + is_naive_fp16=is_naive_fp16) self.bert_layers.append(bert_layer) if self.last_stage: self.word_embeddings = VocabEmbedding(vocab_size, hidden_size) self.layer_norm = LayerNorm(hidden_size) - self.head = BertDualHead(hidden_size, vocab_size, - add_binary_head=add_binary_head) + self.head = BertDualHead(hidden_size, vocab_size, add_binary_head=add_binary_head) self.reset_parameters() def _init_normal(self, tensor): diff --git a/examples/tutorial/sequence_parallel/model/layers/bert_layer.py b/examples/tutorial/sequence_parallel/model/layers/bert_layer.py index 4ede21516f65..56ba511d8274 100644 --- a/examples/tutorial/sequence_parallel/model/layers/bert_layer.py +++ b/examples/tutorial/sequence_parallel/model/layers/bert_layer.py @@ -1,10 +1,12 @@ import torch import torch.nn as nn -from colossalai.nn.layer.parallel_sequence import TransformerSelfAttentionRing -from colossalai.kernel.jit import bias_dropout_add_fused_train, bias_dropout_add_fused_inference + from colossalai.kernel.cuda_native import LayerNorm -from .mlp import TransformerMLP +from colossalai.kernel.jit import bias_dropout_add_fused_inference, bias_dropout_add_fused_train +from colossalai.legacy.nn.layer.parallel_sequence import TransformerSelfAttentionRing + from .dropout import get_bias_dropout_add +from .mlp import TransformerMLP def attention_mask_func(attention_scores, attention_mask): @@ -48,8 +50,7 @@ def __init__(self, layer_number=layer_number, apply_query_key_layer_scaling=True, convert_fp16_to_fp32_in_softmax=convert_fp16_to_fp32_in_softmax, - fp16=is_naive_fp16 - ) + fp16=is_naive_fp16) self.hidden_dropout = hidden_dropout self.bias_dropout_fusion = bias_dropout_fusion @@ -89,11 +90,8 @@ def forward(self, hidden_states, attention_mask): # re-enable torch grad to enable fused optimization. with torch.enable_grad(): - layernorm_input = bias_dropout_add_func( - attention_output, - attention_bias.expand_as(residual), - residual, - self.hidden_dropout) + layernorm_input = bias_dropout_add_func(attention_output, attention_bias.expand_as(residual), residual, + self.hidden_dropout) # Layer norm post the self attention. layernorm_output = self.post_attention_layernorm(layernorm_input) @@ -109,10 +107,6 @@ def forward(self, hidden_states, attention_mask): # re-enable torch grad to enable fused optimization. 
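The reflowed calls in `bert_layer.py` around here wrap the fused bias-dropout-add kernels (`bias_dropout_add_fused_train` / `bias_dropout_add_fused_inference`). For reference, a non-fused equivalent of what those kernels compute, following the usual Megatron-style definition (an assumption for illustration, not the kernel source):

```python
import torch
import torch.nn.functional as F

def bias_dropout_add(x, bias, residual, prob, training):
    # dropout(x + bias) added back onto the residual stream
    out = F.dropout(x + bias, p=prob, training=training)
    return residual + out
```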
with torch.enable_grad(): - output = bias_dropout_add_func( - mlp_output, - mlp_bias.expand_as(residual), - residual, - self.hidden_dropout) + output = bias_dropout_add_func(mlp_output, mlp_bias.expand_as(residual), residual, self.hidden_dropout) return output diff --git a/tests/components_to_test/hanging_param_model.py b/tests/components_to_test/hanging_param_model.py index 329a08ea28f0..0e65431217c7 100644 --- a/tests/components_to_test/hanging_param_model.py +++ b/tests/components_to_test/hanging_param_model.py @@ -2,7 +2,7 @@ import torch.nn as nn import torch.nn.functional as F -from colossalai.nn import CheckpointModule +from colossalai.legacy.nn import CheckpointModule from .registry import non_distributed_component_funcs from .utils.dummy_data_generator import DummyDataGenerator diff --git a/tests/components_to_test/inline_op_model.py b/tests/components_to_test/inline_op_model.py index f061d48f92c6..80757f361d9e 100644 --- a/tests/components_to_test/inline_op_model.py +++ b/tests/components_to_test/inline_op_model.py @@ -2,7 +2,7 @@ import torch.nn as nn import torch.nn.functional as F -from colossalai.nn import CheckpointModule +from colossalai.legacy.nn import CheckpointModule from .registry import non_distributed_component_funcs from .utils.dummy_data_generator import DummyDataGenerator diff --git a/tests/components_to_test/nested_model.py b/tests/components_to_test/nested_model.py index 339084639244..3e779b0a6428 100644 --- a/tests/components_to_test/nested_model.py +++ b/tests/components_to_test/nested_model.py @@ -2,7 +2,7 @@ import torch.nn as nn import torch.nn.functional as F -from colossalai.nn import CheckpointModule +from colossalai.legacy.nn import CheckpointModule from .registry import non_distributed_component_funcs from .utils import DummyDataGenerator diff --git a/tests/components_to_test/repeated_computed_layers.py b/tests/components_to_test/repeated_computed_layers.py index b3f84bd0e203..c1ef99aa07b4 100644 --- a/tests/components_to_test/repeated_computed_layers.py +++ b/tests/components_to_test/repeated_computed_layers.py @@ -3,7 +3,7 @@ import torch import torch.nn as nn -from colossalai.nn import CheckpointModule +from colossalai.legacy.nn import CheckpointModule from .registry import non_distributed_component_funcs from .utils.dummy_data_generator import DummyDataGenerator diff --git a/tests/components_to_test/simple_net.py b/tests/components_to_test/simple_net.py index cd9d7ebc0b1a..064974a15a97 100644 --- a/tests/components_to_test/simple_net.py +++ b/tests/components_to_test/simple_net.py @@ -1,7 +1,7 @@ import torch import torch.nn as nn -from colossalai.nn import CheckpointModule +from colossalai.legacy.nn import CheckpointModule from colossalai.utils.cuda import get_current_device from .registry import non_distributed_component_funcs diff --git a/tests/test_layers/test_1d/checks_1d/__init__.py b/tests/test_legacy/test_layers/test_1d/checks_1d/__init__.py similarity index 100% rename from tests/test_layers/test_1d/checks_1d/__init__.py rename to tests/test_legacy/test_layers/test_1d/checks_1d/__init__.py diff --git a/tests/test_layers/test_1d/checks_1d/check_layer_1d.py b/tests/test_legacy/test_layers/test_1d/checks_1d/check_layer_1d.py similarity index 99% rename from tests/test_layers/test_1d/checks_1d/check_layer_1d.py rename to tests/test_legacy/test_layers/test_1d/checks_1d/check_layer_1d.py index 668b8a334800..dcb2be62671b 100644 --- a/tests/test_layers/test_1d/checks_1d/check_layer_1d.py +++ 
b/tests/test_legacy/test_layers/test_1d/checks_1d/check_layer_1d.py @@ -5,7 +5,7 @@ from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc from colossalai.global_variables import tensor_parallel_env as env -from colossalai.nn import ( +from colossalai.legacy.nn import ( Classifier1D, Embedding1D, Linear1D_Col, diff --git a/tests/test_layers/test_1d/checks_1d/common.py b/tests/test_legacy/test_layers/test_1d/checks_1d/common.py similarity index 94% rename from tests/test_layers/test_1d/checks_1d/common.py rename to tests/test_legacy/test_layers/test_1d/checks_1d/common.py index 8b7b28613d22..29a9a3d20330 100644 --- a/tests/test_layers/test_1d/checks_1d/common.py +++ b/tests/test_legacy/test_layers/test_1d/checks_1d/common.py @@ -1,15 +1,16 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -import torch - -DEPTH = 4 -BATCH_SIZE = 8 -SEQ_LENGTH = 8 -IMG_SIZE = 16 -HIDDEN_SIZE = 8 -NUM_CLASSES = 8 -VOCAB_SIZE = 16 - -def check_equal(A, B): - assert torch.allclose(A, B, rtol=1e-3, atol=1e-1) == True +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +import torch + +DEPTH = 4 +BATCH_SIZE = 8 +SEQ_LENGTH = 8 +IMG_SIZE = 16 +HIDDEN_SIZE = 8 +NUM_CLASSES = 8 +VOCAB_SIZE = 16 + + +def check_equal(A, B): + assert torch.allclose(A, B, rtol=1e-3, atol=1e-1) == True diff --git a/tests/test_layers/test_1d/test_1d.py b/tests/test_legacy/test_layers/test_1d/test_1d.py similarity index 100% rename from tests/test_layers/test_1d/test_1d.py rename to tests/test_legacy/test_layers/test_1d/test_1d.py diff --git a/tests/test_layers/test_2d/checks_2d/__init__.py b/tests/test_legacy/test_layers/test_2d/checks_2d/__init__.py similarity index 100% rename from tests/test_layers/test_2d/checks_2d/__init__.py rename to tests/test_legacy/test_layers/test_2d/checks_2d/__init__.py diff --git a/tests/test_layers/test_2d/checks_2d/check_layer_2d.py b/tests/test_legacy/test_layers/test_2d/checks_2d/check_layer_2d.py similarity index 97% rename from tests/test_layers/test_2d/checks_2d/check_layer_2d.py rename to tests/test_legacy/test_layers/test_2d/checks_2d/check_layer_2d.py index e030e473a363..0ee88c26035f 100644 --- a/tests/test_layers/test_2d/checks_2d/check_layer_2d.py +++ b/tests/test_legacy/test_layers/test_2d/checks_2d/check_layer_2d.py @@ -1,12 +1,23 @@ import torch + from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc -from colossalai.nn import (Classifier2D, CrossEntropyLoss2D, Embedding2D, LayerNorm2D, Linear2D, PatchEmbedding2D, - VanillaClassifier, VanillaPatchEmbedding, VocabParallelClassifier2D, - VocabParallelCrossEntropyLoss2D, VocabParallelEmbedding2D) +from colossalai.legacy.nn import ( + Classifier2D, + CrossEntropyLoss2D, + Embedding2D, + LayerNorm2D, + Linear2D, + PatchEmbedding2D, + VanillaClassifier, + VanillaPatchEmbedding, + VocabParallelClassifier2D, + VocabParallelCrossEntropyLoss2D, + VocabParallelEmbedding2D, +) from colossalai.utils import get_current_device, print_rank_0 -from .common import (BATCH_SIZE, DEPTH, HIDDEN_SIZE, IMG_SIZE, NUM_CLASSES, SEQ_LENGTH, VOCAB_SIZE, check_equal) +from .common import BATCH_SIZE, DEPTH, HIDDEN_SIZE, IMG_SIZE, NUM_CLASSES, SEQ_LENGTH, VOCAB_SIZE, check_equal def check_linear(): @@ -336,7 +347,7 @@ def check_classifier_no_given_weight(): layer.weight.data.copy_(W) # W.requires_grad = True - B_shape = (OUTPUT_SIZE, ) + B_shape = (OUTPUT_SIZE,) B_master = torch.randint(5, B_shape, dtype=dtype, device=device) torch.distributed.broadcast(B_master, 
src=0) # B = torch.chunk(B_master, DEPTH, dim=0)[j] @@ -572,7 +583,7 @@ def check_loss(): out_shape = (BATCH_SIZE, NUM_CLASSES) out_master = torch.randn(out_shape, dtype=dtype, device=device) - target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE, ), dtype=torch.long, device=device) + target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE,), dtype=torch.long, device=device) torch.distributed.broadcast(out_master, src=0) torch.distributed.broadcast(target_master, src=0) out = torch.chunk(out_master, DEPTH, dim=0)[i] @@ -607,7 +618,7 @@ def check_vocab_parallel_loss(): out_shape = (BATCH_SIZE, NUM_CLASSES) out_master = torch.randn(out_shape, dtype=dtype, device=device) - target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE, ), dtype=torch.long, device=device) + target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE,), dtype=torch.long, device=device) torch.distributed.broadcast(out_master, src=0) torch.distributed.broadcast(target_master, src=0) out = torch.chunk(out_master, DEPTH, dim=0)[i] diff --git a/tests/test_layers/test_2d/checks_2d/check_operation_2d.py b/tests/test_legacy/test_layers/test_2d/checks_2d/check_operation_2d.py similarity index 96% rename from tests/test_layers/test_2d/checks_2d/check_operation_2d.py rename to tests/test_legacy/test_layers/test_2d/checks_2d/check_operation_2d.py index a5e37b1ec309..ae1d1120cfb9 100644 --- a/tests/test_layers/test_2d/checks_2d/check_operation_2d.py +++ b/tests/test_legacy/test_layers/test_2d/checks_2d/check_operation_2d.py @@ -5,10 +5,10 @@ from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc -from colossalai.nn.layer.parallel_2d._operation import Matmul_AB_2D, Matmul_ABT_2D, Matmul_ATB_2D -from colossalai.utils import get_current_device -from colossalai.utils import print_rank_0 -from .common import check_equal, BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE, DEPTH +from colossalai.legacy.nn.layer.parallel_2d._operation import Matmul_AB_2D, Matmul_ABT_2D, Matmul_ATB_2D +from colossalai.utils import get_current_device, print_rank_0 + +from .common import BATCH_SIZE, DEPTH, HIDDEN_SIZE, SEQ_LENGTH, check_equal def check_AB(): diff --git a/tests/test_layers/test_2d/checks_2d/common.py b/tests/test_legacy/test_layers/test_2d/checks_2d/common.py similarity index 100% rename from tests/test_layers/test_2d/checks_2d/common.py rename to tests/test_legacy/test_layers/test_2d/checks_2d/common.py diff --git a/tests/test_layers/test_2d/test_2d.py b/tests/test_legacy/test_layers/test_2d/test_2d.py similarity index 100% rename from tests/test_layers/test_2d/test_2d.py rename to tests/test_legacy/test_layers/test_2d/test_2d.py diff --git a/tests/test_layers/test_2p5d/checks_2p5d/__init__.py b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/__init__.py similarity index 100% rename from tests/test_layers/test_2p5d/checks_2p5d/__init__.py rename to tests/test_legacy/test_layers/test_2p5d/checks_2p5d/__init__.py diff --git a/tests/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py similarity index 98% rename from tests/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py rename to tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py index a8f551093b1e..5a99b05cfe7e 100644 --- a/tests/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py +++ b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py @@ -1,11 +1,22 @@ import torch +from torch.nn import Parameter + from 
colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc -from colossalai.nn import (Classifier2p5D, CrossEntropyLoss2p5D, Embedding2p5D, LayerNorm2p5D, Linear2p5D, - PatchEmbedding2p5D, VanillaClassifier, VanillaPatchEmbedding, VocabParallelClassifier2p5D, - VocabParallelCrossEntropyLoss2p5D, VocabParallelEmbedding2p5D) +from colossalai.legacy.nn import ( + Classifier2p5D, + CrossEntropyLoss2p5D, + Embedding2p5D, + LayerNorm2p5D, + Linear2p5D, + PatchEmbedding2p5D, + VanillaClassifier, + VanillaPatchEmbedding, + VocabParallelClassifier2p5D, + VocabParallelCrossEntropyLoss2p5D, + VocabParallelEmbedding2p5D, +) from colossalai.utils import get_current_device, print_rank_0 -from torch.nn import Parameter from .common import * @@ -342,7 +353,7 @@ def check_classifier_no_given_weight(): layer.weight.data.copy_(W) # W.requires_grad = True - B_shape = (OUTPUT_SIZE, ) + B_shape = (OUTPUT_SIZE,) B_master = torch.randint(5, B_shape, dtype=dtype, device=device) torch.distributed.broadcast(B_master, src=0) # B = torch.chunk(B_master, TESSERACT_DIM, dim=0)[j] @@ -577,7 +588,7 @@ def check_loss(): out_shape = (BATCH_SIZE, NUM_CLASSES) out_master = torch.randn(out_shape, dtype=dtype, device=device) - target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE, ), dtype=torch.long, device=device) + target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE,), dtype=torch.long, device=device) torch.distributed.broadcast(out_master, src=0) torch.distributed.broadcast(target_master, src=0) out = torch.chunk(out_master, TESSERACT_DIM, dim=0)[i] @@ -612,7 +623,7 @@ def check_vocab_parallel_loss(): out_shape = (BATCH_SIZE, NUM_CLASSES) out_master = torch.randn(out_shape, dtype=dtype, device=device) - target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE, ), dtype=torch.long, device=device) + target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE,), dtype=torch.long, device=device) torch.distributed.broadcast(out_master, src=0) torch.distributed.broadcast(target_master, src=0) out = torch.chunk(out_master, TESSERACT_DIM, dim=0)[i] diff --git a/tests/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py similarity index 97% rename from tests/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py rename to tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py index d0c3b02fccba..db19967676d2 100644 --- a/tests/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py +++ b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py @@ -2,10 +2,9 @@ from colossalai.context import ParallelMode from colossalai.core import global_context as gpc -from colossalai.nn.layer.parallel_2p5d._operation import Matmul_AB_2p5D, Matmul_ABT_2p5D, \ - Matmul_ATB_2p5D -from colossalai.utils import get_current_device -from colossalai.utils import print_rank_0 +from colossalai.legacy.nn.layer.parallel_2p5d._operation import Matmul_AB_2p5D, Matmul_ABT_2p5D, Matmul_ATB_2p5D +from colossalai.utils import get_current_device, print_rank_0 + from .common import * diff --git a/tests/test_layers/test_2p5d/checks_2p5d/common.py b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/common.py similarity index 75% rename from tests/test_layers/test_2p5d/checks_2p5d/common.py rename to tests/test_legacy/test_layers/test_2p5d/checks_2p5d/common.py index aff85f109666..c90d8fc086bd 100644 --- a/tests/test_layers/test_2p5d/checks_2p5d/common.py +++ 
b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/common.py @@ -11,4 +11,4 @@ def check_equal(A, B): - assert torch.allclose(A, B, rtol=1e-5, atol=1e-2) \ No newline at end of file + assert torch.allclose(A, B, rtol=1e-5, atol=1e-2) diff --git a/tests/test_layers/test_2p5d/test_2p5d.py b/tests/test_legacy/test_layers/test_2p5d/test_2p5d.py similarity index 100% rename from tests/test_layers/test_2p5d/test_2p5d.py rename to tests/test_legacy/test_layers/test_2p5d/test_2p5d.py diff --git a/tests/test_layers/test_3d/checks_3d/__init__.py b/tests/test_legacy/test_layers/test_3d/checks_3d/__init__.py similarity index 100% rename from tests/test_layers/test_3d/checks_3d/__init__.py rename to tests/test_legacy/test_layers/test_3d/checks_3d/__init__.py diff --git a/tests/test_layers/test_3d/checks_3d/check_layer_3d.py b/tests/test_legacy/test_layers/test_3d/checks_3d/check_layer_3d.py similarity index 99% rename from tests/test_layers/test_3d/checks_3d/check_layer_3d.py rename to tests/test_legacy/test_layers/test_3d/checks_3d/check_layer_3d.py index e946a1f5912d..cee639a9f00a 100644 --- a/tests/test_layers/test_3d/checks_3d/check_layer_3d.py +++ b/tests/test_legacy/test_layers/test_3d/checks_3d/check_layer_3d.py @@ -7,8 +7,7 @@ from colossalai.constants import INPUT_GROUP_3D, OUTPUT_GROUP_3D, WEIGHT_GROUP_3D from colossalai.core import global_context -from colossalai.logging import get_dist_logger -from colossalai.nn import ( +from colossalai.legacy.nn import ( Classifier3D, CrossEntropyLoss3D, Embedding3D, @@ -21,7 +20,8 @@ VocabParallelCrossEntropyLoss3D, VocabParallelEmbedding3D, ) -from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env +from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env +from colossalai.logging import get_dist_logger from colossalai.utils import get_current_device, print_rank_0 from .common import BATCH_SIZE, DEPTH, HIDDEN_SIZE, IMG_SIZE, NUM_CLASSES, SEQ_LENGTH, VOCAB_SIZE, check_equal diff --git a/tests/test_layers/test_3d/checks_3d/common.py b/tests/test_legacy/test_layers/test_3d/checks_3d/common.py similarity index 95% rename from tests/test_layers/test_3d/checks_3d/common.py rename to tests/test_legacy/test_layers/test_3d/checks_3d/common.py index afb19c4745cc..509fc2cecf59 100644 --- a/tests/test_layers/test_3d/checks_3d/common.py +++ b/tests/test_legacy/test_layers/test_3d/checks_3d/common.py @@ -16,4 +16,4 @@ def check_equal(A, B): eq = torch.allclose(A, B, rtol=1e-3, atol=1e-2) assert eq, f"\nA = {A}\nB = {B}" - return eq \ No newline at end of file + return eq diff --git a/tests/test_layers/test_3d/test_3d.py b/tests/test_legacy/test_layers/test_3d/test_3d.py similarity index 100% rename from tests/test_layers/test_3d/test_3d.py rename to tests/test_legacy/test_layers/test_3d/test_3d.py diff --git a/tests/test_layers/test_cache_embedding.py b/tests/test_legacy/test_layers/test_cache_embedding.py similarity index 99% rename from tests/test_layers/test_cache_embedding.py rename to tests/test_legacy/test_layers/test_cache_embedding.py index 22d4f02a48d7..0760a3f1ec38 100644 --- a/tests/test_layers/test_cache_embedding.py +++ b/tests/test_legacy/test_layers/test_cache_embedding.py @@ -6,7 +6,7 @@ import torch import colossalai -from colossalai.nn.parallel.layers import ( +from colossalai.legacy.nn.parallel.layers import ( CachedEmbeddingBag, CachedParamMgr, EvictionStrategy, diff --git a/tests/test_layers/test_sequence/checks_seq/__init__.py b/tests/test_legacy/test_layers/test_sequence/checks_seq/__init__.py 
similarity index 100% rename from tests/test_layers/test_sequence/checks_seq/__init__.py rename to tests/test_legacy/test_layers/test_sequence/checks_seq/__init__.py diff --git a/tests/test_layers/test_sequence/checks_seq/check_layer_seq.py b/tests/test_legacy/test_layers/test_sequence/checks_seq/check_layer_seq.py similarity index 91% rename from tests/test_layers/test_sequence/checks_seq/check_layer_seq.py rename to tests/test_legacy/test_layers/test_sequence/checks_seq/check_layer_seq.py index 2b7b999d4373..7ff91a7b76e0 100644 --- a/tests/test_layers/test_sequence/checks_seq/check_layer_seq.py +++ b/tests/test_legacy/test_layers/test_sequence/checks_seq/check_layer_seq.py @@ -2,7 +2,7 @@ from colossalai.context import ParallelMode from colossalai.core import global_context as gpc -from colossalai.nn import TransformerSelfAttentionRing +from colossalai.legacy.nn import TransformerSelfAttentionRing from colossalai.utils import get_current_device diff --git a/tests/test_layers/test_sequence/test_sequence.py b/tests/test_legacy/test_layers/test_sequence/test_sequence.py similarity index 97% rename from tests/test_layers/test_sequence/test_sequence.py rename to tests/test_legacy/test_layers/test_sequence/test_sequence.py index 60f2d55f43af..b9e6c12479ee 100644 --- a/tests/test_layers/test_sequence/test_sequence.py +++ b/tests/test_legacy/test_layers/test_sequence/test_sequence.py @@ -5,6 +5,7 @@ import colossalai from colossalai.context import ParallelMode from colossalai.core import global_context as gpc +from colossalai.legacy.nn.layer.parallel_sequence import RingAV, RingQK from colossalai.testing import rerun_if_address_is_in_use, spawn CONFIG = dict(parallel=dict(tensor=dict(size=4, mode='sequence'))) @@ -42,7 +43,7 @@ def check_ring_qk(rank, world_size): a = torch.matmul(q, k.transpose(2, 1)) # compute distributed attention scores - ring_qk = colossalai.nn.layer.parallel_sequence.RingQK.apply + ring_qk = RingQK.apply sub_a = ring_qk(sub_q, sub_k, batch_size, num_heads, sub_seq_length) # check master and distributed attention scores @@ -95,7 +96,7 @@ def check_ring_av(rank, world_size): out = torch.matmul(a, v) # compute distributed attention scores - ring_av = colossalai.nn.layer.parallel_sequence.RingAV.apply + ring_av = RingAV.apply sub_out = ring_av(sub_a, sub_v, batch_size, num_heads, attention_head_size, sub_seq_length) # print(f'master output shape: {out.shape}, partial output shape: {sub_out.shape}') diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_1d.py b/tests/test_utils/test_checkpoint/test_checkpoint_1d.py index 335be61359ed..9c3a7e2161d2 100644 --- a/tests/test_utils/test_checkpoint/test_checkpoint_1d.py +++ b/tests/test_utils/test_checkpoint/test_checkpoint_1d.py @@ -7,7 +7,7 @@ import torch import torch.nn as nn -import colossalai.nn as col_nn +import colossalai.legacy.nn as col_nn from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc from colossalai.initialize import launch diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_2d.py b/tests/test_utils/test_checkpoint/test_checkpoint_2d.py index 175d9ef6ceb9..03b2e4f2a9b2 100644 --- a/tests/test_utils/test_checkpoint/test_checkpoint_2d.py +++ b/tests/test_utils/test_checkpoint/test_checkpoint_2d.py @@ -7,7 +7,7 @@ import torch import torch.nn as nn -import colossalai.nn as col_nn +import colossalai.legacy.nn as col_nn from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc from 
colossalai.initialize import launch diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py b/tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py index 33cb3a65d184..cafffd0a6202 100644 --- a/tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py +++ b/tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py @@ -7,7 +7,7 @@ import torch import torch.nn as nn -import colossalai.nn as col_nn +import colossalai.legacy.nn as col_nn from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc from colossalai.initialize import launch diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_3d.py b/tests/test_utils/test_checkpoint/test_checkpoint_3d.py index 73ac2dd5fe18..9b43be9e8cc5 100644 --- a/tests/test_utils/test_checkpoint/test_checkpoint_3d.py +++ b/tests/test_utils/test_checkpoint/test_checkpoint_3d.py @@ -7,7 +7,7 @@ import torch import torch.nn as nn -import colossalai.nn as col_nn +import colossalai.legacy.nn as col_nn from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc from colossalai.initialize import launch
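Note on usage after this patch: the test hunks above show that the tensor-parallel layers, losses, and sequence-parallel ops now resolve under `colossalai.legacy.nn` instead of `colossalai.nn`. A minimal sketch of the updated import style is below; the symbol names are taken directly from the hunks above, while any surrounding test setup is assumed and omitted.

```python
# Illustrative only: import paths as they stand after this patch.
import colossalai.legacy.nn as col_nn  # previously: import colossalai.nn as col_nn

# Tensor-parallel layers and losses used by the relocated layer tests.
from colossalai.legacy.nn import Classifier1D, Embedding1D, Linear1D_Col
from colossalai.legacy.nn import CrossEntropyLoss2D, LayerNorm2D, Linear2D

# Sequence-parallel ring-attention ops are now imported directly
# rather than referenced via the full dotted colossalai.nn.layer path.
from colossalai.legacy.nn.layer.parallel_sequence import RingAV, RingQK
```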