From 5190ac0e1d820f47db2ae6ee119fb900e083348a Mon Sep 17 00:00:00 2001
From: ver217
Date: Thu, 7 Sep 2023 16:47:06 +0800
Subject: [PATCH] [legacy] move nn to legacy

---
 .../offload/base_offload_module.py | 2 +-
 colossalai/cli/benchmark/models.py | 2 +-
 colossalai/kernel/jit/option.py | 2 +-
 colossalai/legacy/nn/__init__.py | 4 +
 colossalai/{ => legacy}/nn/_ops/__init__.py | 0
 colossalai/{ => legacy}/nn/_ops/_utils.py | 4 +-
 colossalai/{ => legacy}/nn/_ops/addmm.py | 0
 colossalai/{ => legacy}/nn/_ops/batch_norm.py | 0
 .../{ => legacy}/nn/_ops/element_wise.py | 0
 colossalai/{ => legacy}/nn/_ops/embedding.py | 8 +-
 .../{ => legacy}/nn/_ops/embedding_bag.py | 8 +-
 colossalai/{ => legacy}/nn/_ops/layernorm.py | 5 +-
 colossalai/{ => legacy}/nn/_ops/linear.py | 0
 colossalai/{ => legacy}/nn/_ops/loss.py | 9 +-
 colossalai/{ => legacy}/nn/_ops/view.py | 0
 colossalai/legacy/nn/layer/__init__.py | 9 +
 .../{ => legacy}/nn/layer/base_layer.py | 0
 .../nn/layer/colossalai_layer/__init__.py | 14 +-
 .../nn/layer/colossalai_layer/_utils.py | 0
 .../nn/layer/colossalai_layer/dropout.py | 0
 .../nn/layer/colossalai_layer/embedding.py | 303 +++++++++---------
 .../nn/layer/colossalai_layer/linear.py | 2 +-
 .../layer/colossalai_layer/normalization.py | 83 ++---
 .../legacy/nn/layer/parallel_1d/__init__.py | 17 +
 .../nn/layer/parallel_1d/_operation.py | 0
 .../nn/layer/parallel_1d/_utils.py | 3 +-
 .../nn/layer/parallel_1d/layers.py | 0
 .../nn/layer/parallel_2d/__init__.py | 11 +-
 .../nn/layer/parallel_2d/_operation.py | 0
 .../nn/layer/parallel_2d/_utils.py | 0
 .../nn/layer/parallel_2d/layers.py | 0
 .../nn/layer/parallel_2p5d/__init__.py | 11 +-
 .../nn/layer/parallel_2p5d/_operation.py | 0
 .../nn/layer/parallel_2p5d/_utils.py | 0
 .../nn/layer/parallel_2p5d/layers.py | 0
 .../nn/layer/parallel_3d/__init__.py | 11 +-
 .../nn/layer/parallel_3d/_operation.py | 0
 .../nn/layer/parallel_3d/_utils.py | 0
 .../nn/layer/parallel_3d/layers.py | 2 +-
 .../nn/layer/parallel_sequence/__init__.py | 2 +-
 .../nn/layer/parallel_sequence/_operation.py | 2 +-
 .../nn/layer/parallel_sequence/_utils.py | 0
 .../nn/layer/parallel_sequence/layers.py | 2 +-
 colossalai/legacy/nn/layer/utils/__init__.py | 15 +
 .../{ => legacy}/nn/layer/utils/common.py | 3 +-
 .../{ => legacy}/nn/layer/vanilla/__init__.py | 0
 .../{ => legacy}/nn/layer/vanilla/layers.py | 0
 .../{ => legacy}/nn/layer/wrapper/__init__.py | 0
 .../nn/layer/wrapper/pipeline_wrapper.py | 6 +-
 colossalai/legacy/nn/loss/__init__.py | 41 +++
 colossalai/{ => legacy}/nn/loss/loss_1d.py | 0
 colossalai/{ => legacy}/nn/loss/loss_2d.py | 4 +-
 colossalai/{ => legacy}/nn/loss/loss_2p5d.py | 4 +-
 colossalai/{ => legacy}/nn/loss/loss_3d.py | 4 +-
 colossalai/{ => legacy}/nn/metric/__init__.py | 54 ++--
 colossalai/{ => legacy}/nn/metric/_utils.py | 14 +-
 .../{ => legacy}/nn/metric/accuracy_2d.py | 3 +-
 .../{ => legacy}/nn/metric/accuracy_2p5d.py | 3 +-
 .../{ => legacy}/nn/metric/accuracy_3d.py | 68 ++--
 .../{ => legacy}/nn/parallel/__init__.py | 0
 .../{ => legacy}/nn/parallel/data_parallel.py | 0
 .../nn/parallel/layers/__init__.py | 17 +-
 .../layers/cache_embedding/__init__.py | 4 +-
 .../layers/cache_embedding/base_embedding.py | 1 +
 .../layers/cache_embedding/cache_mgr.py | 20 +-
 .../cache_embedding/cached_embedding.py | 11 +-
 .../parallel/layers/cache_embedding/copyer.py | 4 +-
 .../cache_embedding/embedding_config.py | 0
 .../parallel_cached_embedding.py | 9 +-
 .../parallel_cached_embedding_tablewise.py | 13 +-
 ..._cached_embedding_tablewise_split_cache.py | 14 +-
 .../nn/parallel/layers/colo_module.py | 5 +-
 .../nn/parallel/layers/embedding.py | 3 +-
 .../{ => legacy}/nn/parallel/layers/linear.py | 3 +-
 .../nn/parallel/layers/module_utils.py | 8 +-
 .../{ => legacy}/nn/parallel/reducer.py | 0
 colossalai/nn/__init__.py | 2 -
 colossalai/nn/layer/__init__.py | 8 -
 colossalai/nn/layer/parallel_1d/__init__.py | 7 -
 colossalai/nn/layer/utils.py | 14 +
 colossalai/nn/layer/utils/__init__.py | 7 -
 colossalai/nn/loss/__init__.py | 40 ---
 colossalai/pipeline/pipelinable.py | 25 +-
 colossalai/pipeline/utils.py | 11 +-
 colossalai/tensor/dist_spec_mgr.py | 1 -
 colossalai/utils/__init__.py | 4 +
 colossalai/utils/common.py | 19 ++
 colossalai/zero/gemini/colo_init_context.py | 2 +-
 colossalai/zero/gemini/gemini_ddp.py | 8 +-
 .../memory_tracer/runtime_mem_tracer.py | 2 +-
 ...parallelize_your_training_like_Megatron.md | 2 +-
 .../train_gpt_using_hybrid_parallelism.md | 2 +-
 .../train_vit_with_hybrid_parallelism.md | 2 +-
 docs/source/en/basics/engine_trainer.md | 2 +-
 ...parallelize_your_training_like_Megatron.md | 2 +-
 .../train_gpt_using_hybrid_parallelism.md | 2 +-
 .../train_vit_with_hybrid_parallelism.md | 2 +-
 docs/source/zh-Hans/basics/engine_trainer.md | 2 +-
 examples/language/gpt/titans/model/embed.py | 8 +-
 examples/language/gpt/titans/model/gpt1d.py | 6 +-
 .../gpt/titans/model/pipeline_gpt1d.py | 2 +-
 examples/tutorial/hybrid_parallel/train.py | 2 +-
 .../tutorial/sequence_parallel/model/bert.py | 60 ++--
 .../model/layers/bert_layer.py | 24 +-
 .../components_to_test/hanging_param_model.py | 2 +-
 tests/components_to_test/inline_op_model.py | 2 +-
 tests/components_to_test/nested_model.py | 2 +-
 .../repeated_computed_layers.py | 2 +-
 tests/components_to_test/simple_net.py | 2 +-
 .../test_layers/test_1d/checks_1d/__init__.py | 0
 .../test_1d/checks_1d/check_layer_1d.py | 2 +-
 .../test_layers/test_1d/checks_1d/common.py | 31 +-
 .../test_layers/test_1d/test_1d.py | 0
 .../test_layers/test_2d/checks_2d/__init__.py | 0
 .../test_2d/checks_2d/check_layer_2d.py | 25 +-
 .../test_2d/checks_2d/check_operation_2d.py | 8 +-
 .../test_layers/test_2d/checks_2d/common.py | 0
 .../test_layers/test_2d/test_2d.py | 0
 .../test_2p5d/checks_2p5d/__init__.py | 0
 .../test_2p5d/checks_2p5d/check_layer_2p5d.py | 25 +-
 .../checks_2p5d/check_operation_2p5d.py | 7 +-
 .../test_2p5d/checks_2p5d/common.py | 2 +-
 .../test_layers/test_2p5d/test_2p5d.py | 0
 .../test_layers/test_3d/checks_3d/__init__.py | 0
 .../test_3d/checks_3d/check_layer_3d.py | 6 +-
 .../test_layers/test_3d/checks_3d/common.py | 2 +-
 .../test_layers/test_3d/test_3d.py | 0
 .../test_layers/test_cache_embedding.py | 2 +-
 .../test_sequence/checks_seq/__init__.py | 0
 .../checks_seq/check_layer_seq.py | 2 +-
 .../test_sequence/test_sequence.py | 5 +-
 .../test_checkpoint/test_checkpoint_1d.py | 2 +-
 .../test_checkpoint/test_checkpoint_2d.py | 2 +-
 .../test_checkpoint/test_checkpoint_2p5d.py | 2 +-
 .../test_checkpoint/test_checkpoint_3d.py | 2 +-
 135 files changed, 697 insertions(+), 553 deletions(-)
 create mode 100644 colossalai/legacy/nn/__init__.py
 rename colossalai/{ => legacy}/nn/_ops/__init__.py (100%)
 rename colossalai/{ => legacy}/nn/_ops/_utils.py (99%)
 rename colossalai/{ => legacy}/nn/_ops/addmm.py (100%)
 rename colossalai/{ => legacy}/nn/_ops/batch_norm.py (100%)
 rename colossalai/{ => legacy}/nn/_ops/element_wise.py (100%)
 rename colossalai/{ => legacy}/nn/_ops/embedding.py (98%)
 rename colossalai/{ => legacy}/nn/_ops/embedding_bag.py (97%)
 rename colossalai/{ => legacy}/nn/_ops/layernorm.py (92%)
 rename colossalai/{ => legacy}/nn/_ops/linear.py (100%)
 rename colossalai/{ => legacy}/nn/_ops/loss.py (96%)
 rename colossalai/{ => legacy}/nn/_ops/view.py (100%)
 create mode 100644 colossalai/legacy/nn/layer/__init__.py
 rename colossalai/{ => legacy}/nn/layer/base_layer.py (100%)
 rename colossalai/{ => legacy}/nn/layer/colossalai_layer/__init__.py (97%)
 rename colossalai/{ => legacy}/nn/layer/colossalai_layer/_utils.py (100%)
 rename colossalai/{ => legacy}/nn/layer/colossalai_layer/dropout.py (100%)
 rename colossalai/{ => legacy}/nn/layer/colossalai_layer/embedding.py (97%)
 rename colossalai/{ => legacy}/nn/layer/colossalai_layer/linear.py (99%)
 rename colossalai/{ => legacy}/nn/layer/colossalai_layer/normalization.py (97%)
 create mode 100644 colossalai/legacy/nn/layer/parallel_1d/__init__.py
 rename colossalai/{ => legacy}/nn/layer/parallel_1d/_operation.py (100%)
 rename colossalai/{ => legacy}/nn/layer/parallel_1d/_utils.py (99%)
 rename colossalai/{ => legacy}/nn/layer/parallel_1d/layers.py (100%)
 rename colossalai/{ => legacy}/nn/layer/parallel_2d/__init__.py (59%)
 rename colossalai/{ => legacy}/nn/layer/parallel_2d/_operation.py (100%)
 rename colossalai/{ => legacy}/nn/layer/parallel_2d/_utils.py (100%)
 rename colossalai/{ => legacy}/nn/layer/parallel_2d/layers.py (100%)
 rename colossalai/{ => legacy}/nn/layer/parallel_2p5d/__init__.py (59%)
 rename colossalai/{ => legacy}/nn/layer/parallel_2p5d/_operation.py (100%)
 rename colossalai/{ => legacy}/nn/layer/parallel_2p5d/_utils.py (100%)
 rename colossalai/{ => legacy}/nn/layer/parallel_2p5d/layers.py (100%)
 rename colossalai/{ => legacy}/nn/layer/parallel_3d/__init__.py (62%)
 rename colossalai/{ => legacy}/nn/layer/parallel_3d/_operation.py (100%)
 rename colossalai/{ => legacy}/nn/layer/parallel_3d/_utils.py (100%)
 rename colossalai/{ => legacy}/nn/layer/parallel_3d/layers.py (99%)
 rename colossalai/{ => legacy}/nn/layer/parallel_sequence/__init__.py (74%)
 rename colossalai/{ => legacy}/nn/layer/parallel_sequence/_operation.py (98%)
 rename colossalai/{ => legacy}/nn/layer/parallel_sequence/_utils.py (100%)
 rename colossalai/{ => legacy}/nn/layer/parallel_sequence/layers.py (99%)
 create mode 100644 colossalai/legacy/nn/layer/utils/__init__.py
 rename colossalai/{ => legacy}/nn/layer/utils/common.py (99%)
 rename colossalai/{ => legacy}/nn/layer/vanilla/__init__.py (100%)
 rename colossalai/{ => legacy}/nn/layer/vanilla/layers.py (100%)
 rename colossalai/{ => legacy}/nn/layer/wrapper/__init__.py (100%)
 rename colossalai/{ => legacy}/nn/layer/wrapper/pipeline_wrapper.py (99%)
 create mode 100644 colossalai/legacy/nn/loss/__init__.py
 rename colossalai/{ => legacy}/nn/loss/loss_1d.py (100%)
 rename colossalai/{ => legacy}/nn/loss/loss_2d.py (97%)
 rename colossalai/{ => legacy}/nn/loss/loss_2p5d.py (96%)
 rename colossalai/{ => legacy}/nn/loss/loss_3d.py (97%)
 rename colossalai/{ => legacy}/nn/metric/__init__.py (87%)
 rename colossalai/{ => legacy}/nn/metric/_utils.py (95%)
 rename colossalai/{ => legacy}/nn/metric/accuracy_2d.py (89%)
 rename colossalai/{ => legacy}/nn/metric/accuracy_2p5d.py (88%)
 rename colossalai/{ => legacy}/nn/metric/accuracy_3d.py (85%)
 rename colossalai/{ => legacy}/nn/parallel/__init__.py (100%)
 rename colossalai/{ => legacy}/nn/parallel/data_parallel.py (100%)
 rename colossalai/{ => legacy}/nn/parallel/layers/__init__.py (56%)
 rename colossalai/{ => legacy}/nn/parallel/layers/cache_embedding/__init__.py (100%)
 rename colossalai/{ => legacy}/nn/parallel/layers/cache_embedding/base_embedding.py (99%)
 rename colossalai/{ => legacy}/nn/parallel/layers/cache_embedding/cache_mgr.py (99%)
 rename colossalai/{ => legacy}/nn/parallel/layers/cache_embedding/cached_embedding.py (98%)
 rename colossalai/{ => legacy}/nn/parallel/layers/cache_embedding/copyer.py (97%)
 rename colossalai/{ => legacy}/nn/parallel/layers/cache_embedding/embedding_config.py (100%)
 rename colossalai/{ => legacy}/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py (96%)
 rename colossalai/{ => legacy}/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py (99%)
 rename colossalai/{ => legacy}/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py (99%)
 rename colossalai/{ => legacy}/nn/parallel/layers/colo_module.py (98%)
 rename colossalai/{ => legacy}/nn/parallel/layers/embedding.py (92%)
 rename colossalai/{ => legacy}/nn/parallel/layers/linear.py (93%)
 rename colossalai/{ => legacy}/nn/parallel/layers/module_utils.py (99%)
 rename colossalai/{ => legacy}/nn/parallel/reducer.py (100%)
 delete mode 100644 colossalai/nn/layer/parallel_1d/__init__.py
 create mode 100644 colossalai/nn/layer/utils.py
 delete mode 100644 colossalai/nn/layer/utils/__init__.py
 rename tests/{ => test_legacy}/test_layers/test_1d/checks_1d/__init__.py (100%)
 rename tests/{ => test_legacy}/test_layers/test_1d/checks_1d/check_layer_1d.py (99%)
 rename tests/{ => test_legacy}/test_layers/test_1d/checks_1d/common.py (94%)
 rename tests/{ => test_legacy}/test_layers/test_1d/test_1d.py (100%)
 rename tests/{ => test_legacy}/test_layers/test_2d/checks_2d/__init__.py (100%)
 rename tests/{ => test_legacy}/test_layers/test_2d/checks_2d/check_layer_2d.py (97%)
 rename tests/{ => test_legacy}/test_layers/test_2d/checks_2d/check_operation_2d.py (96%)
 rename tests/{ => test_legacy}/test_layers/test_2d/checks_2d/common.py (100%)
 rename tests/{ => test_legacy}/test_layers/test_2d/test_2d.py (100%)
 rename tests/{ => test_legacy}/test_layers/test_2p5d/checks_2p5d/__init__.py (100%)
 rename tests/{ => test_legacy}/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py (98%)
 rename tests/{ => test_legacy}/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py (97%)
 rename tests/{ => test_legacy}/test_layers/test_2p5d/checks_2p5d/common.py (75%)
 rename tests/{ => test_legacy}/test_layers/test_2p5d/test_2p5d.py (100%)
 rename tests/{ => test_legacy}/test_layers/test_3d/checks_3d/__init__.py (100%)
 rename tests/{ => test_legacy}/test_layers/test_3d/checks_3d/check_layer_3d.py (99%)
 rename tests/{ => test_legacy}/test_layers/test_3d/checks_3d/common.py (95%)
 rename tests/{ => test_legacy}/test_layers/test_3d/test_3d.py (100%)
 rename tests/{ => test_legacy}/test_layers/test_cache_embedding.py (99%)
 rename tests/{ => test_legacy}/test_layers/test_sequence/checks_seq/__init__.py (100%)
 rename tests/{ => test_legacy}/test_layers/test_sequence/checks_seq/check_layer_seq.py (91%)
 rename tests/{ => test_legacy}/test_layers/test_sequence/test_sequence.py (97%)

diff --git a/colossalai/auto_parallel/offload/base_offload_module.py b/colossalai/auto_parallel/offload/base_offload_module.py
index d0c328e134ff..5b9f74b132f3 100644
--- a/colossalai/auto_parallel/offload/base_offload_module.py
+++ b/colossalai/auto_parallel/offload/base_offload_module.py
@@ -4,7 +4,7 @@
 import torch
 import torch.nn as nn
 
-from colossalai.nn.parallel.data_parallel import _cast_float
+from colossalai.utils import _cast_float
 from colossalai.zero.legacy.gemini.tensor_utils import free_storage
 
 from .region_manager import RegionManager
diff --git a/colossalai/cli/benchmark/models.py b/colossalai/cli/benchmark/models.py
index
f8fd1c41a059..385b485b6016 100644 --- a/colossalai/cli/benchmark/models.py +++ b/colossalai/cli/benchmark/models.py @@ -1,6 +1,6 @@ import torch -import colossalai.nn as col_nn +import colossalai.legacy.nn as col_nn class MLP(torch.nn.Module): diff --git a/colossalai/kernel/jit/option.py b/colossalai/kernel/jit/option.py index e20c08b051ed..8eb4e0c880a0 100644 --- a/colossalai/kernel/jit/option.py +++ b/colossalai/kernel/jit/option.py @@ -1,6 +1,6 @@ import torch -from colossalai.nn.layer.colossalai_layer import Embedding, Linear +from colossalai.legacy.nn.layer.colossalai_layer import Embedding, Linear from colossalai.utils import get_current_device from .bias_dropout_add import bias_dropout_add_fused_train diff --git a/colossalai/legacy/nn/__init__.py b/colossalai/legacy/nn/__init__.py new file mode 100644 index 000000000000..500162901905 --- /dev/null +++ b/colossalai/legacy/nn/__init__.py @@ -0,0 +1,4 @@ +from ._ops import * +from .layer import * +from .loss import * +from .metric import * diff --git a/colossalai/nn/_ops/__init__.py b/colossalai/legacy/nn/_ops/__init__.py similarity index 100% rename from colossalai/nn/_ops/__init__.py rename to colossalai/legacy/nn/_ops/__init__.py diff --git a/colossalai/nn/_ops/_utils.py b/colossalai/legacy/nn/_ops/_utils.py similarity index 99% rename from colossalai/nn/_ops/_utils.py rename to colossalai/legacy/nn/_ops/_utils.py index 24877bbb552f..131c2154771b 100644 --- a/colossalai/nn/_ops/_utils.py +++ b/colossalai/legacy/nn/_ops/_utils.py @@ -4,7 +4,7 @@ import torch.distributed as dist from colossalai.global_variables import tensor_parallel_env as env -from colossalai.nn.layer.utils import divide +from colossalai.legacy.nn.layer.utils import divide from colossalai.tensor import ColoTensor, ColoTensorSpec, ProcessGroup GeneralTensor = Union[ColoTensor, torch.Tensor] @@ -232,7 +232,7 @@ def dual_all_to_all(x, pg, scatter_dim: int, gather_dim: int): return _DualAllToAll.apply(x, pg, scatter_dim, gather_dim) -### table wise embedding shard +# table wise embedding shard def _all_to_all_for_tablewise(x: torch.Tensor, diff --git a/colossalai/nn/_ops/addmm.py b/colossalai/legacy/nn/_ops/addmm.py similarity index 100% rename from colossalai/nn/_ops/addmm.py rename to colossalai/legacy/nn/_ops/addmm.py diff --git a/colossalai/nn/_ops/batch_norm.py b/colossalai/legacy/nn/_ops/batch_norm.py similarity index 100% rename from colossalai/nn/_ops/batch_norm.py rename to colossalai/legacy/nn/_ops/batch_norm.py diff --git a/colossalai/nn/_ops/element_wise.py b/colossalai/legacy/nn/_ops/element_wise.py similarity index 100% rename from colossalai/nn/_ops/element_wise.py rename to colossalai/legacy/nn/_ops/element_wise.py diff --git a/colossalai/nn/_ops/embedding.py b/colossalai/legacy/nn/_ops/embedding.py similarity index 98% rename from colossalai/nn/_ops/embedding.py rename to colossalai/legacy/nn/_ops/embedding.py index a045f305b5dc..b145d1763380 100644 --- a/colossalai/nn/_ops/embedding.py +++ b/colossalai/legacy/nn/_ops/embedding.py @@ -1,8 +1,10 @@ -import torch.nn.functional as F from typing import Optional + +import torch.nn.functional as F + +from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec from colossalai.tensor.op_wrapper import colo_op_impl -from colossalai.tensor import ComputePattern, ColoTensorSpec, ComputePattern, ComputeSpec, ColoTensor, ShardSpec, \ - ReplicaSpec + from ._utils import GeneralTensor, convert_to_colo_tensor, reduce_input diff --git 
a/colossalai/nn/_ops/embedding_bag.py b/colossalai/legacy/nn/_ops/embedding_bag.py similarity index 97% rename from colossalai/nn/_ops/embedding_bag.py rename to colossalai/legacy/nn/_ops/embedding_bag.py index 0026f579b6dc..9a656d5871a3 100644 --- a/colossalai/nn/_ops/embedding_bag.py +++ b/colossalai/legacy/nn/_ops/embedding_bag.py @@ -1,9 +1,11 @@ -import torch.nn.functional as F from typing import Optional + +import torch.nn.functional as F from torch import Tensor + +from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec, distspec from colossalai.tensor.op_wrapper import colo_op_impl -from colossalai.tensor import ComputePattern, ComputePattern, ComputeSpec, ColoTensor, distspec, ColoTensorSpec, \ - ShardSpec, ReplicaSpec + from ._utils import GeneralTensor, convert_to_colo_tensor diff --git a/colossalai/nn/_ops/layernorm.py b/colossalai/legacy/nn/_ops/layernorm.py similarity index 92% rename from colossalai/nn/_ops/layernorm.py rename to colossalai/legacy/nn/_ops/layernorm.py index 2b761b84e3ee..9960c5d48096 100644 --- a/colossalai/nn/_ops/layernorm.py +++ b/colossalai/legacy/nn/_ops/layernorm.py @@ -1,7 +1,10 @@ from typing import List, Optional + import torch.nn.functional as F + +from colossalai.tensor import ColoTensor, ColoTensorSpec, ReplicaSpec, distspec from colossalai.tensor.op_wrapper import colo_op_impl -from colossalai.tensor import ColoTensor, distspec, ColoTensorSpec, ReplicaSpec + from ._utils import GeneralTensor, convert_to_colo_tensor diff --git a/colossalai/nn/_ops/linear.py b/colossalai/legacy/nn/_ops/linear.py similarity index 100% rename from colossalai/nn/_ops/linear.py rename to colossalai/legacy/nn/_ops/linear.py diff --git a/colossalai/nn/_ops/loss.py b/colossalai/legacy/nn/_ops/loss.py similarity index 96% rename from colossalai/nn/_ops/loss.py rename to colossalai/legacy/nn/_ops/loss.py index 1e54f662859c..90efbfa36f2a 100644 --- a/colossalai/nn/_ops/loss.py +++ b/colossalai/legacy/nn/_ops/loss.py @@ -1,9 +1,12 @@ +from typing import Optional + import torch import torch.nn.functional as F -from typing import Optional -from colossalai.tensor.op_wrapper import colo_op_impl + +from colossalai.legacy.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D from colossalai.tensor import ColoTensor, ColoTensorSpec -from colossalai.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D +from colossalai.tensor.op_wrapper import colo_op_impl + from ._utils import GeneralTensor, convert_to_colo_tensor diff --git a/colossalai/nn/_ops/view.py b/colossalai/legacy/nn/_ops/view.py similarity index 100% rename from colossalai/nn/_ops/view.py rename to colossalai/legacy/nn/_ops/view.py diff --git a/colossalai/legacy/nn/layer/__init__.py b/colossalai/legacy/nn/layer/__init__.py new file mode 100644 index 000000000000..86961dd933a7 --- /dev/null +++ b/colossalai/legacy/nn/layer/__init__.py @@ -0,0 +1,9 @@ +from .colossalai_layer import * +from .parallel_1d import * +from .parallel_2d import * +from .parallel_2p5d import * +from .parallel_3d import * +from .parallel_sequence import * +from .utils import * +from .vanilla import * +from .wrapper import * diff --git a/colossalai/nn/layer/base_layer.py b/colossalai/legacy/nn/layer/base_layer.py similarity index 100% rename from colossalai/nn/layer/base_layer.py rename to colossalai/legacy/nn/layer/base_layer.py diff --git a/colossalai/nn/layer/colossalai_layer/__init__.py b/colossalai/legacy/nn/layer/colossalai_layer/__init__.py similarity index 97% rename from 
colossalai/nn/layer/colossalai_layer/__init__.py rename to colossalai/legacy/nn/layer/colossalai_layer/__init__.py index 2ae1b07a75b2..ed743820ddbc 100644 --- a/colossalai/nn/layer/colossalai_layer/__init__.py +++ b/colossalai/legacy/nn/layer/colossalai_layer/__init__.py @@ -1,7 +1,7 @@ -from ._utils import partition_batch -from .dropout import Dropout -from .embedding import Embedding, PatchEmbedding -from .linear import Classifier, Linear -from .normalization import LayerNorm - -__all__ = ['Linear', 'Classifier', 'Embedding', 'PatchEmbedding', 'LayerNorm', 'Dropout', 'partition_batch'] +from ._utils import partition_batch +from .dropout import Dropout +from .embedding import Embedding, PatchEmbedding +from .linear import Classifier, Linear +from .normalization import LayerNorm + +__all__ = ['Linear', 'Classifier', 'Embedding', 'PatchEmbedding', 'LayerNorm', 'Dropout', 'partition_batch'] diff --git a/colossalai/nn/layer/colossalai_layer/_utils.py b/colossalai/legacy/nn/layer/colossalai_layer/_utils.py similarity index 100% rename from colossalai/nn/layer/colossalai_layer/_utils.py rename to colossalai/legacy/nn/layer/colossalai_layer/_utils.py diff --git a/colossalai/nn/layer/colossalai_layer/dropout.py b/colossalai/legacy/nn/layer/colossalai_layer/dropout.py similarity index 100% rename from colossalai/nn/layer/colossalai_layer/dropout.py rename to colossalai/legacy/nn/layer/colossalai_layer/dropout.py diff --git a/colossalai/nn/layer/colossalai_layer/embedding.py b/colossalai/legacy/nn/layer/colossalai_layer/embedding.py similarity index 97% rename from colossalai/nn/layer/colossalai_layer/embedding.py rename to colossalai/legacy/nn/layer/colossalai_layer/embedding.py index e5c9c46e0ff1..28bcb7ffefb0 100644 --- a/colossalai/nn/layer/colossalai_layer/embedding.py +++ b/colossalai/legacy/nn/layer/colossalai_layer/embedding.py @@ -1,151 +1,152 @@ -import math -from typing import Callable - -from colossalai.utils import get_current_device -from torch import dtype, nn - -from ... import init as init -from ..parallel_1d import Embedding1D, PatchEmbedding1D, VocabParallelEmbedding1D -from ..parallel_2d import Embedding2D, PatchEmbedding2D, VocabParallelEmbedding2D -from ..parallel_2p5d import Embedding2p5D, PatchEmbedding2p5D, VocabParallelEmbedding2p5D -from ..parallel_3d import Embedding3D, PatchEmbedding3D, VocabParallelEmbedding3D -from ..utils import get_tensor_parallel_mode -from ..vanilla import VanillaPatchEmbedding -from ._utils import ColossalaiModule - -_parallel_embedding = { - '1d': Embedding1D, - '2d': Embedding2D, - '2.5d': Embedding2p5D, - '3d': Embedding3D, -} - -_vocab_parallel_embedding = { - '1d': VocabParallelEmbedding1D, - '2d': VocabParallelEmbedding2D, - '2.5d': VocabParallelEmbedding2p5D, - '3d': VocabParallelEmbedding3D -} - -_parallel_patchembedding = { - None: VanillaPatchEmbedding, - '1d': PatchEmbedding1D, - '2d': PatchEmbedding2D, - '2.5d': PatchEmbedding2p5D, - '3d': PatchEmbedding3D -} - - -class Embedding(ColossalaiModule): - r"""Embedding for colossalai. - - Args: - num_embeddings (int): number of embeddings. - embedding_dim (int): dimension of embedding. - padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient; - therefore, the embedding vector at padding_idx is not updated during training, - i.e. it remains as a fixed “pad”, defaults to None. - dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None. 
- weight_initializer (:class:`typing.Callable`, optional): - he initializer of weight, defaults to normal initializer. - - The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain: - :: - - max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is - renormalized to have norm max_norm. Note: this will modify weight in-place. - norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2. - scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse - of frequency of the words in the mini-batch. Default False. - sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False. - - More details about ``args`` and ``kwargs`` could be found in - `Embedding `_. - - More details about ``initializer`` please refer to - `init `_ - """ - - def __init__(self, - num_embeddings: int, - embedding_dim: int, - padding_idx: int = None, - dtype: dtype = None, - weight_initializer: Callable = init.normal_(), - vocab_parallel_limit: int = 2048, - *args, - **kwargs) -> None: - tensor_parallel = get_tensor_parallel_mode() - if tensor_parallel is None: - embed = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx, *args, - **kwargs).to(dtype).to(get_current_device()) - weight_initializer(embed.weight, fan_in=num_embeddings, fan_out=embedding_dim) - elif num_embeddings <= vocab_parallel_limit: - embed = _parallel_embedding[tensor_parallel]( - num_embeddings, - embedding_dim, - padding_idx=padding_idx, - dtype=dtype, - weight_initializer=weight_initializer, - *args, - **kwargs, - ) - else: - embed = _vocab_parallel_embedding[tensor_parallel]( - num_embeddings, - embedding_dim, - padding_idx=padding_idx, - dtype=dtype, - weight_initializer=weight_initializer, - *args, - **kwargs, - ) - super().__init__(embed) - - -class PatchEmbedding(ColossalaiModule): - """2D Image to Patch Embedding. - - Args: - img_size (int): image size. - patch_size (int): patch size. - in_chans (int): number of channels of input image. - embed_size (int): size of embedding. - dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None. - flatten (bool, optional): whether to flatten output tensor, defaults to True. - weight_initializer (:class:`typing.Callable`, optional): - The initializer of weight, defaults to kaiming uniform initializer. - bias_initializer (:class:`typing.Callable`, optional): - The initializer of bias, defaults to xavier uniform initializer. - position_embed_initializer (:class:`typing.Callable`, optional): - The initializer of position embedding, defaults to zeros initializer. - - More details about ``initializer`` please refer to - `init `_. 
- """ - - def __init__( - self, - img_size: int, - patch_size: int, - in_chans: int, - embed_size: int, - dtype: dtype = None, - flatten: bool = True, - weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)), - bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1), - position_embed_initializer: Callable = init.zeros_() - ) -> None: - tensor_parallel = get_tensor_parallel_mode() - embed = _parallel_patchembedding[tensor_parallel]( - img_size, - patch_size, - in_chans, - embed_size, - dtype=dtype, - flatten=flatten, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - position_embed_initializer=position_embed_initializer, - ) - super().__init__(embed) +import math +from typing import Callable + +from torch import dtype, nn + +from colossalai.nn import init +from colossalai.utils import get_current_device + +from ..parallel_1d import Embedding1D, PatchEmbedding1D, VocabParallelEmbedding1D +from ..parallel_2d import Embedding2D, PatchEmbedding2D, VocabParallelEmbedding2D +from ..parallel_2p5d import Embedding2p5D, PatchEmbedding2p5D, VocabParallelEmbedding2p5D +from ..parallel_3d import Embedding3D, PatchEmbedding3D, VocabParallelEmbedding3D +from ..utils import get_tensor_parallel_mode +from ..vanilla import VanillaPatchEmbedding +from ._utils import ColossalaiModule + +_parallel_embedding = { + '1d': Embedding1D, + '2d': Embedding2D, + '2.5d': Embedding2p5D, + '3d': Embedding3D, +} + +_vocab_parallel_embedding = { + '1d': VocabParallelEmbedding1D, + '2d': VocabParallelEmbedding2D, + '2.5d': VocabParallelEmbedding2p5D, + '3d': VocabParallelEmbedding3D +} + +_parallel_patchembedding = { + None: VanillaPatchEmbedding, + '1d': PatchEmbedding1D, + '2d': PatchEmbedding2D, + '2.5d': PatchEmbedding2p5D, + '3d': PatchEmbedding3D +} + + +class Embedding(ColossalaiModule): + r"""Embedding for colossalai. + + Args: + num_embeddings (int): number of embeddings. + embedding_dim (int): dimension of embedding. + padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient; + therefore, the embedding vector at padding_idx is not updated during training, + i.e. it remains as a fixed “pad”, defaults to None. + dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None. + weight_initializer (:class:`typing.Callable`, optional): + he initializer of weight, defaults to normal initializer. + + The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain: + :: + + max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is + renormalized to have norm max_norm. Note: this will modify weight in-place. + norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2. + scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse + of frequency of the words in the mini-batch. Default False. + sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False. + + More details about ``args`` and ``kwargs`` could be found in + `Embedding `_. 
+ + More details about ``initializer`` please refer to + `init `_ + """ + + def __init__(self, + num_embeddings: int, + embedding_dim: int, + padding_idx: int = None, + dtype: dtype = None, + weight_initializer: Callable = init.normal_(), + vocab_parallel_limit: int = 2048, + *args, + **kwargs) -> None: + tensor_parallel = get_tensor_parallel_mode() + if tensor_parallel is None: + embed = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx, *args, + **kwargs).to(dtype).to(get_current_device()) + weight_initializer(embed.weight, fan_in=num_embeddings, fan_out=embedding_dim) + elif num_embeddings <= vocab_parallel_limit: + embed = _parallel_embedding[tensor_parallel]( + num_embeddings, + embedding_dim, + padding_idx=padding_idx, + dtype=dtype, + weight_initializer=weight_initializer, + *args, + **kwargs, + ) + else: + embed = _vocab_parallel_embedding[tensor_parallel]( + num_embeddings, + embedding_dim, + padding_idx=padding_idx, + dtype=dtype, + weight_initializer=weight_initializer, + *args, + **kwargs, + ) + super().__init__(embed) + + +class PatchEmbedding(ColossalaiModule): + """2D Image to Patch Embedding. + + Args: + img_size (int): image size. + patch_size (int): patch size. + in_chans (int): number of channels of input image. + embed_size (int): size of embedding. + dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None. + flatten (bool, optional): whether to flatten output tensor, defaults to True. + weight_initializer (:class:`typing.Callable`, optional): + The initializer of weight, defaults to kaiming uniform initializer. + bias_initializer (:class:`typing.Callable`, optional): + The initializer of bias, defaults to xavier uniform initializer. + position_embed_initializer (:class:`typing.Callable`, optional): + The initializer of position embedding, defaults to zeros initializer. + + More details about ``initializer`` please refer to + `init `_. + """ + + def __init__( + self, + img_size: int, + patch_size: int, + in_chans: int, + embed_size: int, + dtype: dtype = None, + flatten: bool = True, + weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)), + bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1), + position_embed_initializer: Callable = init.zeros_() + ) -> None: + tensor_parallel = get_tensor_parallel_mode() + embed = _parallel_patchembedding[tensor_parallel]( + img_size, + patch_size, + in_chans, + embed_size, + dtype=dtype, + flatten=flatten, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + position_embed_initializer=position_embed_initializer, + ) + super().__init__(embed) diff --git a/colossalai/nn/layer/colossalai_layer/linear.py b/colossalai/legacy/nn/layer/colossalai_layer/linear.py similarity index 99% rename from colossalai/nn/layer/colossalai_layer/linear.py rename to colossalai/legacy/nn/layer/colossalai_layer/linear.py index 3e0c6e285c1c..c05ceb66ce25 100644 --- a/colossalai/nn/layer/colossalai_layer/linear.py +++ b/colossalai/legacy/nn/layer/colossalai_layer/linear.py @@ -4,9 +4,9 @@ from torch import dtype, nn +from colossalai.nn import init from colossalai.utils import get_current_device -from ... 
import init as init from ..parallel_1d import * from ..parallel_2d import * from ..parallel_2p5d import * diff --git a/colossalai/nn/layer/colossalai_layer/normalization.py b/colossalai/legacy/nn/layer/colossalai_layer/normalization.py similarity index 97% rename from colossalai/nn/layer/colossalai_layer/normalization.py rename to colossalai/legacy/nn/layer/colossalai_layer/normalization.py index 86861d30214a..f8e317e723f1 100644 --- a/colossalai/nn/layer/colossalai_layer/normalization.py +++ b/colossalai/legacy/nn/layer/colossalai_layer/normalization.py @@ -1,41 +1,42 @@ -from colossalai.utils import get_current_device -from torch import nn - -from ..parallel_1d import LayerNorm1D -from ..parallel_2d import LayerNorm2D -from ..parallel_2p5d import LayerNorm2p5D -from ..parallel_3d import LayerNorm3D -from ..utils import get_tensor_parallel_mode -from ..vanilla import VanillaLayerNorm -from ._utils import ColossalaiModule - -_parallel_layernorm = { - None: VanillaLayerNorm, - "1d": LayerNorm1D, - "2d": LayerNorm2D, - "2.5d": LayerNorm2p5D, - "3d": LayerNorm3D, -} - - -class LayerNorm(ColossalaiModule): - r"""Layer Normalization for colossalai. - - Args: - normalized_shape (int): input shape from an expected input of size. - :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] - \times \ldots \times \text{normalized_shape}[-1]]` - If a single integer is used, it is treated as a singleton list, and this module will - normalize over the last dimension which is expected to be of that specific size. - eps (float): a value added to the denominator for numerical stability, defaults to 1e-05. - bias (bool, optional): Whether to add a bias, defaults to ``True``. - dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None. - """ - - def __init__(self, normalized_shape: int, eps=1e-05, bias=True, dtype=None) -> None: - tensor_parallel = get_tensor_parallel_mode() - if tensor_parallel is None: - norm = nn.LayerNorm(normalized_shape, eps=eps).to(dtype).to(get_current_device()) - else: - norm = _parallel_layernorm[tensor_parallel](normalized_shape, eps=eps, dtype=dtype) - super().__init__(norm) +from torch import nn + +from colossalai.utils import get_current_device + +from ..parallel_1d import LayerNorm1D +from ..parallel_2d import LayerNorm2D +from ..parallel_2p5d import LayerNorm2p5D +from ..parallel_3d import LayerNorm3D +from ..utils import get_tensor_parallel_mode +from ..vanilla import VanillaLayerNorm +from ._utils import ColossalaiModule + +_parallel_layernorm = { + None: VanillaLayerNorm, + "1d": LayerNorm1D, + "2d": LayerNorm2D, + "2.5d": LayerNorm2p5D, + "3d": LayerNorm3D, +} + + +class LayerNorm(ColossalaiModule): + r"""Layer Normalization for colossalai. + + Args: + normalized_shape (int): input shape from an expected input of size. + :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] + \times \ldots \times \text{normalized_shape}[-1]]` + If a single integer is used, it is treated as a singleton list, and this module will + normalize over the last dimension which is expected to be of that specific size. + eps (float): a value added to the denominator for numerical stability, defaults to 1e-05. + bias (bool, optional): Whether to add a bias, defaults to ``True``. + dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None. 
+ """ + + def __init__(self, normalized_shape: int, eps=1e-05, bias=True, dtype=None) -> None: + tensor_parallel = get_tensor_parallel_mode() + if tensor_parallel is None: + norm = nn.LayerNorm(normalized_shape, eps=eps).to(dtype).to(get_current_device()) + else: + norm = _parallel_layernorm[tensor_parallel](normalized_shape, eps=eps, dtype=dtype) + super().__init__(norm) diff --git a/colossalai/legacy/nn/layer/parallel_1d/__init__.py b/colossalai/legacy/nn/layer/parallel_1d/__init__.py new file mode 100644 index 000000000000..9cffd4d339f5 --- /dev/null +++ b/colossalai/legacy/nn/layer/parallel_1d/__init__.py @@ -0,0 +1,17 @@ +from .layers import ( + Classifier1D, + Dropout1D, + Embedding1D, + LayerNorm1D, + Linear1D, + Linear1D_Col, + Linear1D_Row, + PatchEmbedding1D, + VocabParallelClassifier1D, + VocabParallelEmbedding1D, +) + +__all__ = [ + 'Linear1D', 'Linear1D_Col', 'Linear1D_Row', 'Embedding1D', 'Dropout1D', 'Classifier1D', 'VocabParallelClassifier1D', + 'VocabParallelEmbedding1D', 'LayerNorm1D', 'PatchEmbedding1D' +] diff --git a/colossalai/nn/layer/parallel_1d/_operation.py b/colossalai/legacy/nn/layer/parallel_1d/_operation.py similarity index 100% rename from colossalai/nn/layer/parallel_1d/_operation.py rename to colossalai/legacy/nn/layer/parallel_1d/_operation.py diff --git a/colossalai/nn/layer/parallel_1d/_utils.py b/colossalai/legacy/nn/layer/parallel_1d/_utils.py similarity index 99% rename from colossalai/nn/layer/parallel_1d/_utils.py rename to colossalai/legacy/nn/layer/parallel_1d/_utils.py index 1212d595635d..fddf4e73db51 100644 --- a/colossalai/nn/layer/parallel_1d/_utils.py +++ b/colossalai/legacy/nn/layer/parallel_1d/_utils.py @@ -3,6 +3,7 @@ import torch import torch.distributed as dist + from colossalai.core import global_context as gpc from colossalai.global_variables import tensor_parallel_env as env @@ -124,7 +125,7 @@ def backward(ctx, grad_output): class _SplitForwardGatherBackward(torch.autograd.Function): """ Split the input and keep only the corresponding chuck to the rank. - + Args: input_: input matrix. parallel_mode: parallel mode. 
diff --git a/colossalai/nn/layer/parallel_1d/layers.py b/colossalai/legacy/nn/layer/parallel_1d/layers.py similarity index 100% rename from colossalai/nn/layer/parallel_1d/layers.py rename to colossalai/legacy/nn/layer/parallel_1d/layers.py diff --git a/colossalai/nn/layer/parallel_2d/__init__.py b/colossalai/legacy/nn/layer/parallel_2d/__init__.py similarity index 59% rename from colossalai/nn/layer/parallel_2d/__init__.py rename to colossalai/legacy/nn/layer/parallel_2d/__init__.py index 5562d1a70036..9c65f3608710 100644 --- a/colossalai/nn/layer/parallel_2d/__init__.py +++ b/colossalai/legacy/nn/layer/parallel_2d/__init__.py @@ -1,6 +1,13 @@ from ._operation import reduce_by_batch_2d, split_batch_2d -from .layers import (Classifier2D, Embedding2D, LayerNorm2D, Linear2D, PatchEmbedding2D, VocabParallelClassifier2D, - VocabParallelEmbedding2D) +from .layers import ( + Classifier2D, + Embedding2D, + LayerNorm2D, + Linear2D, + PatchEmbedding2D, + VocabParallelClassifier2D, + VocabParallelEmbedding2D, +) __all__ = [ 'split_batch_2d', 'reduce_by_batch_2d', 'Linear2D', 'LayerNorm2D', 'Classifier2D', 'PatchEmbedding2D', diff --git a/colossalai/nn/layer/parallel_2d/_operation.py b/colossalai/legacy/nn/layer/parallel_2d/_operation.py similarity index 100% rename from colossalai/nn/layer/parallel_2d/_operation.py rename to colossalai/legacy/nn/layer/parallel_2d/_operation.py diff --git a/colossalai/nn/layer/parallel_2d/_utils.py b/colossalai/legacy/nn/layer/parallel_2d/_utils.py similarity index 100% rename from colossalai/nn/layer/parallel_2d/_utils.py rename to colossalai/legacy/nn/layer/parallel_2d/_utils.py diff --git a/colossalai/nn/layer/parallel_2d/layers.py b/colossalai/legacy/nn/layer/parallel_2d/layers.py similarity index 100% rename from colossalai/nn/layer/parallel_2d/layers.py rename to colossalai/legacy/nn/layer/parallel_2d/layers.py diff --git a/colossalai/nn/layer/parallel_2p5d/__init__.py b/colossalai/legacy/nn/layer/parallel_2p5d/__init__.py similarity index 59% rename from colossalai/nn/layer/parallel_2p5d/__init__.py rename to colossalai/legacy/nn/layer/parallel_2p5d/__init__.py index bec3b1c4b0b8..23e47e6ed06b 100644 --- a/colossalai/nn/layer/parallel_2p5d/__init__.py +++ b/colossalai/legacy/nn/layer/parallel_2p5d/__init__.py @@ -1,6 +1,13 @@ from ._operation import reduce_by_batch_2p5d, split_batch_2p5d -from .layers import (Classifier2p5D, Embedding2p5D, LayerNorm2p5D, Linear2p5D, PatchEmbedding2p5D, - VocabParallelClassifier2p5D, VocabParallelEmbedding2p5D) +from .layers import ( + Classifier2p5D, + Embedding2p5D, + LayerNorm2p5D, + Linear2p5D, + PatchEmbedding2p5D, + VocabParallelClassifier2p5D, + VocabParallelEmbedding2p5D, +) __all__ = [ 'split_batch_2p5d', 'reduce_by_batch_2p5d', 'Linear2p5D', 'LayerNorm2p5D', 'Classifier2p5D', 'PatchEmbedding2p5D', diff --git a/colossalai/nn/layer/parallel_2p5d/_operation.py b/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py similarity index 100% rename from colossalai/nn/layer/parallel_2p5d/_operation.py rename to colossalai/legacy/nn/layer/parallel_2p5d/_operation.py diff --git a/colossalai/nn/layer/parallel_2p5d/_utils.py b/colossalai/legacy/nn/layer/parallel_2p5d/_utils.py similarity index 100% rename from colossalai/nn/layer/parallel_2p5d/_utils.py rename to colossalai/legacy/nn/layer/parallel_2p5d/_utils.py diff --git a/colossalai/nn/layer/parallel_2p5d/layers.py b/colossalai/legacy/nn/layer/parallel_2p5d/layers.py similarity index 100% rename from colossalai/nn/layer/parallel_2p5d/layers.py rename to 
colossalai/legacy/nn/layer/parallel_2p5d/layers.py diff --git a/colossalai/nn/layer/parallel_3d/__init__.py b/colossalai/legacy/nn/layer/parallel_3d/__init__.py similarity index 62% rename from colossalai/nn/layer/parallel_3d/__init__.py rename to colossalai/legacy/nn/layer/parallel_3d/__init__.py index 9ae255b449ee..17fe8403c585 100644 --- a/colossalai/nn/layer/parallel_3d/__init__.py +++ b/colossalai/legacy/nn/layer/parallel_3d/__init__.py @@ -1,6 +1,13 @@ from ._operation import reduce_by_batch_3d, split_batch_3d, split_tensor_3d -from .layers import (Classifier3D, Embedding3D, LayerNorm3D, Linear3D, PatchEmbedding3D, VocabParallelClassifier3D, - VocabParallelEmbedding3D) +from .layers import ( + Classifier3D, + Embedding3D, + LayerNorm3D, + Linear3D, + PatchEmbedding3D, + VocabParallelClassifier3D, + VocabParallelEmbedding3D, +) __all__ = [ 'reduce_by_batch_3d', 'split_tensor_3d', 'split_batch_3d', 'Linear3D', 'LayerNorm3D', 'PatchEmbedding3D', diff --git a/colossalai/nn/layer/parallel_3d/_operation.py b/colossalai/legacy/nn/layer/parallel_3d/_operation.py similarity index 100% rename from colossalai/nn/layer/parallel_3d/_operation.py rename to colossalai/legacy/nn/layer/parallel_3d/_operation.py diff --git a/colossalai/nn/layer/parallel_3d/_utils.py b/colossalai/legacy/nn/layer/parallel_3d/_utils.py similarity index 100% rename from colossalai/nn/layer/parallel_3d/_utils.py rename to colossalai/legacy/nn/layer/parallel_3d/_utils.py diff --git a/colossalai/nn/layer/parallel_3d/layers.py b/colossalai/legacy/nn/layer/parallel_3d/layers.py similarity index 99% rename from colossalai/nn/layer/parallel_3d/layers.py rename to colossalai/legacy/nn/layer/parallel_3d/layers.py index 2861b53013e1..b815a842ca52 100644 --- a/colossalai/nn/layer/parallel_3d/layers.py +++ b/colossalai/legacy/nn/layer/parallel_3d/layers.py @@ -13,9 +13,9 @@ from colossalai.core import global_context as gpc from colossalai.global_variables import tensor_parallel_env as env from colossalai.legacy.communication import all_reduce, broadcast +from colossalai.legacy.nn.layer.base_layer import ParallelLayer from colossalai.legacy.registry import LAYERS from colossalai.nn import init as init -from colossalai.nn.layer.base_layer import ParallelLayer from colossalai.utils.checkpointing import ( broadcast_state_dict, gather_tensor_parallel_state_dict, diff --git a/colossalai/nn/layer/parallel_sequence/__init__.py b/colossalai/legacy/nn/layer/parallel_sequence/__init__.py similarity index 74% rename from colossalai/nn/layer/parallel_sequence/__init__.py rename to colossalai/legacy/nn/layer/parallel_sequence/__init__.py index 4fa9eed6f34b..d92d66d40a8e 100644 --- a/colossalai/nn/layer/parallel_sequence/__init__.py +++ b/colossalai/legacy/nn/layer/parallel_sequence/__init__.py @@ -1,4 +1,4 @@ -from ._operation import RingQK, RingAV +from ._operation import RingAV, RingQK from .layers import TransformerSelfAttentionRing __all__ = ['TransformerSelfAttentionRing', 'RingAV', 'RingQK'] diff --git a/colossalai/nn/layer/parallel_sequence/_operation.py b/colossalai/legacy/nn/layer/parallel_sequence/_operation.py similarity index 98% rename from colossalai/nn/layer/parallel_sequence/_operation.py rename to colossalai/legacy/nn/layer/parallel_sequence/_operation.py index d03102527caa..fcf2962017a3 100644 --- a/colossalai/nn/layer/parallel_sequence/_operation.py +++ b/colossalai/legacy/nn/layer/parallel_sequence/_operation.py @@ -8,7 +8,7 @@ from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context 
as gpc from colossalai.legacy.communication import ring_forward -from colossalai.nn.layer.parallel_sequence._utils import _calc_current_device_range, _calc_incoming_device_range +from colossalai.legacy.nn.layer.parallel_sequence._utils import _calc_current_device_range, _calc_incoming_device_range from colossalai.utils import get_current_device diff --git a/colossalai/nn/layer/parallel_sequence/_utils.py b/colossalai/legacy/nn/layer/parallel_sequence/_utils.py similarity index 100% rename from colossalai/nn/layer/parallel_sequence/_utils.py rename to colossalai/legacy/nn/layer/parallel_sequence/_utils.py diff --git a/colossalai/nn/layer/parallel_sequence/layers.py b/colossalai/legacy/nn/layer/parallel_sequence/layers.py similarity index 99% rename from colossalai/nn/layer/parallel_sequence/layers.py rename to colossalai/legacy/nn/layer/parallel_sequence/layers.py index 4d0ff2e0605b..e44e61c2fb7d 100644 --- a/colossalai/nn/layer/parallel_sequence/layers.py +++ b/colossalai/legacy/nn/layer/parallel_sequence/layers.py @@ -14,8 +14,8 @@ from colossalai.core import global_context as gpc from colossalai.kernel import FusedScaleMaskSoftmax from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType +from colossalai.legacy.nn.layer.parallel_sequence._operation import RingAV, RingQK from colossalai.legacy.registry import LAYERS -from colossalai.nn.layer.parallel_sequence._operation import RingAV, RingQK @LAYERS.register_module diff --git a/colossalai/legacy/nn/layer/utils/__init__.py b/colossalai/legacy/nn/layer/utils/__init__.py new file mode 100644 index 000000000000..56e969bfd0bd --- /dev/null +++ b/colossalai/legacy/nn/layer/utils/__init__.py @@ -0,0 +1,15 @@ +from .common import ( + ACT2FN, + CheckpointModule, + _ntuple, + divide, + get_tensor_parallel_mode, + set_tensor_parallel_attribute_by_partition, + set_tensor_parallel_attribute_by_size, + to_2tuple, +) + +__all__ = [ + 'CheckpointModule', 'divide', 'ACT2FN', 'set_tensor_parallel_attribute_by_size', + 'set_tensor_parallel_attribute_by_partition', 'get_tensor_parallel_mode', '_ntuple', 'to_2tuple' +] diff --git a/colossalai/nn/layer/utils/common.py b/colossalai/legacy/nn/layer/utils/common.py similarity index 99% rename from colossalai/nn/layer/utils/common.py rename to colossalai/legacy/nn/layer/utils/common.py index f2297304fdc9..d8f3ad2a7eca 100644 --- a/colossalai/nn/layer/utils/common.py +++ b/colossalai/legacy/nn/layer/utils/common.py @@ -6,10 +6,11 @@ import numpy as np import torch +from torch import Tensor, nn + from colossalai.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS from colossalai.global_variables import tensor_parallel_env as env from colossalai.utils import checkpoint -from torch import Tensor, nn class CheckpointModule(nn.Module): diff --git a/colossalai/nn/layer/vanilla/__init__.py b/colossalai/legacy/nn/layer/vanilla/__init__.py similarity index 100% rename from colossalai/nn/layer/vanilla/__init__.py rename to colossalai/legacy/nn/layer/vanilla/__init__.py diff --git a/colossalai/nn/layer/vanilla/layers.py b/colossalai/legacy/nn/layer/vanilla/layers.py similarity index 100% rename from colossalai/nn/layer/vanilla/layers.py rename to colossalai/legacy/nn/layer/vanilla/layers.py diff --git a/colossalai/nn/layer/wrapper/__init__.py b/colossalai/legacy/nn/layer/wrapper/__init__.py similarity index 100% rename from colossalai/nn/layer/wrapper/__init__.py rename to colossalai/legacy/nn/layer/wrapper/__init__.py diff --git a/colossalai/nn/layer/wrapper/pipeline_wrapper.py 
b/colossalai/legacy/nn/layer/wrapper/pipeline_wrapper.py similarity index 99% rename from colossalai/nn/layer/wrapper/pipeline_wrapper.py rename to colossalai/legacy/nn/layer/wrapper/pipeline_wrapper.py index ef1d794cc68f..68fea8622c5c 100644 --- a/colossalai/nn/layer/wrapper/pipeline_wrapper.py +++ b/colossalai/legacy/nn/layer/wrapper/pipeline_wrapper.py @@ -1,6 +1,8 @@ -import torch.nn as nn -import torch.distributed as dist from typing import List, Tuple, Union + +import torch.distributed as dist +import torch.nn as nn + from colossalai.context import ParallelMode from colossalai.core import global_context as gpc diff --git a/colossalai/legacy/nn/loss/__init__.py b/colossalai/legacy/nn/loss/__init__.py new file mode 100644 index 000000000000..1bd8872d9c3a --- /dev/null +++ b/colossalai/legacy/nn/loss/__init__.py @@ -0,0 +1,41 @@ +from torch import nn +from torch.nn.modules.loss import * +from torch.nn.modules.loss import _Loss + +from colossalai.global_variables import tensor_parallel_env as env +from colossalai.legacy.nn.layer.utils import get_tensor_parallel_mode + +from .loss_1d import VocabParallelCrossEntropyLoss1D +from .loss_2d import CrossEntropyLoss2D, VocabParallelCrossEntropyLoss2D +from .loss_2p5d import CrossEntropyLoss2p5D, VocabParallelCrossEntropyLoss2p5D +from .loss_3d import CrossEntropyLoss3D, VocabParallelCrossEntropyLoss3D + +_parallel_cross_entropy = { + '2d': CrossEntropyLoss2D, + '2.5d': CrossEntropyLoss2p5D, + '3d': CrossEntropyLoss3D, +} + +_vocab_parallel_cross_entropy = { + '1d': VocabParallelCrossEntropyLoss1D, + '2d': VocabParallelCrossEntropyLoss2D, + '2.5d': VocabParallelCrossEntropyLoss2p5D, + '3d': VocabParallelCrossEntropyLoss3D, +} + + +class CrossEntropyLoss(_Loss): + + def __init__(self, reduction: bool = True, *args, **kwargs): + super().__init__() + tensor_parallel = get_tensor_parallel_mode() + if tensor_parallel is not None and env.vocab_parallel: + self.loss = _vocab_parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs) + elif tensor_parallel is None or tensor_parallel == '1d': + reduction = 'mean' if reduction else 'none' + self.loss = nn.CrossEntropyLoss(reduction=reduction, *args, **kwargs) + else: + self.loss = _parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs) + + def forward(self, *args): + return self.loss(*args) diff --git a/colossalai/nn/loss/loss_1d.py b/colossalai/legacy/nn/loss/loss_1d.py similarity index 100% rename from colossalai/nn/loss/loss_1d.py rename to colossalai/legacy/nn/loss/loss_1d.py diff --git a/colossalai/nn/loss/loss_2d.py b/colossalai/legacy/nn/loss/loss_2d.py similarity index 97% rename from colossalai/nn/loss/loss_2d.py rename to colossalai/legacy/nn/loss/loss_2d.py index 6db40c0f3a04..6191602b71ee 100644 --- a/colossalai/nn/loss/loss_2d.py +++ b/colossalai/legacy/nn/loss/loss_2d.py @@ -6,9 +6,9 @@ from colossalai.context import ParallelMode from colossalai.core import global_context as gpc +from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d +from colossalai.legacy.nn.layer.parallel_2d._utils import assert_summa_initialization from colossalai.legacy.registry import LOSSES -from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d -from colossalai.nn.layer.parallel_2d._utils import assert_summa_initialization from colossalai.utils import get_current_device diff --git a/colossalai/nn/loss/loss_2p5d.py b/colossalai/legacy/nn/loss/loss_2p5d.py similarity index 96% rename from colossalai/nn/loss/loss_2p5d.py 
rename to colossalai/legacy/nn/loss/loss_2p5d.py index 9c78a1ef0331..2746b201152c 100644 --- a/colossalai/nn/loss/loss_2p5d.py +++ b/colossalai/legacy/nn/loss/loss_2p5d.py @@ -6,9 +6,9 @@ from colossalai.context import ParallelMode from colossalai.core import global_context as gpc +from colossalai.legacy.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d +from colossalai.legacy.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization from colossalai.legacy.registry import LOSSES -from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d -from colossalai.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization from colossalai.utils import get_current_device diff --git a/colossalai/nn/loss/loss_3d.py b/colossalai/legacy/nn/loss/loss_3d.py similarity index 97% rename from colossalai/nn/loss/loss_3d.py rename to colossalai/legacy/nn/loss/loss_3d.py index 5c0f266401d1..2aeb1bd9825d 100644 --- a/colossalai/nn/loss/loss_3d.py +++ b/colossalai/legacy/nn/loss/loss_3d.py @@ -6,9 +6,9 @@ from colossalai.constants import INPUT_GROUP_3D, OUTPUT_GROUP_3D, WEIGHT_GROUP_3D from colossalai.core import global_context as gpc +from colossalai.legacy.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d +from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env from colossalai.legacy.registry import LOSSES -from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d -from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env from colossalai.utils import get_current_device diff --git a/colossalai/nn/metric/__init__.py b/colossalai/legacy/nn/metric/__init__.py similarity index 87% rename from colossalai/nn/metric/__init__.py rename to colossalai/legacy/nn/metric/__init__.py index 00833b6119c1..76c6dac89c5b 100644 --- a/colossalai/nn/metric/__init__.py +++ b/colossalai/legacy/nn/metric/__init__.py @@ -1,26 +1,28 @@ -from torch import nn - -from ._utils import calc_acc -from .accuracy_2d import Accuracy2D -from .accuracy_2p5d import Accuracy2p5D -from .accuracy_3d import Accuracy3D -from colossalai.nn.layer.utils import get_tensor_parallel_mode - -_parallel_accuracy = { - '2d': Accuracy2D, - '2.5d': Accuracy2p5D, - '3d': Accuracy3D, -} - - -class Accuracy(nn.Module): - def __init__(self): - super().__init__() - tensor_parallel = get_tensor_parallel_mode() - if tensor_parallel not in _parallel_accuracy: - self.acc = calc_acc - else: - self.acc = _parallel_accuracy[tensor_parallel]() - - def forward(self, *args): - return self.acc(*args) +from torch import nn + +from colossalai.legacy.nn.layer.utils import get_tensor_parallel_mode + +from ._utils import calc_acc +from .accuracy_2d import Accuracy2D +from .accuracy_2p5d import Accuracy2p5D +from .accuracy_3d import Accuracy3D + +_parallel_accuracy = { + '2d': Accuracy2D, + '2.5d': Accuracy2p5D, + '3d': Accuracy3D, +} + + +class Accuracy(nn.Module): + + def __init__(self): + super().__init__() + tensor_parallel = get_tensor_parallel_mode() + if tensor_parallel not in _parallel_accuracy: + self.acc = calc_acc + else: + self.acc = _parallel_accuracy[tensor_parallel]() + + def forward(self, *args): + return self.acc(*args) diff --git a/colossalai/nn/metric/_utils.py b/colossalai/legacy/nn/metric/_utils.py similarity index 95% rename from colossalai/nn/metric/_utils.py rename to colossalai/legacy/nn/metric/_utils.py index eac591b64c65..8706ffc101b0 100644 --- a/colossalai/nn/metric/_utils.py +++ 
b/colossalai/legacy/nn/metric/_utils.py @@ -1,7 +1,7 @@ -import torch - - -def calc_acc(logits, targets): - preds = torch.argmax(logits, dim=-1) - correct = torch.sum(targets == preds) - return correct +import torch + + +def calc_acc(logits, targets): + preds = torch.argmax(logits, dim=-1) + correct = torch.sum(targets == preds) + return correct diff --git a/colossalai/nn/metric/accuracy_2d.py b/colossalai/legacy/nn/metric/accuracy_2d.py similarity index 89% rename from colossalai/nn/metric/accuracy_2d.py rename to colossalai/legacy/nn/metric/accuracy_2d.py index a86832973cfd..838c48834a96 100644 --- a/colossalai/nn/metric/accuracy_2d.py +++ b/colossalai/legacy/nn/metric/accuracy_2d.py @@ -1,7 +1,8 @@ import torch -from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d from torch import nn +from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d + from ._utils import calc_acc diff --git a/colossalai/nn/metric/accuracy_2p5d.py b/colossalai/legacy/nn/metric/accuracy_2p5d.py similarity index 88% rename from colossalai/nn/metric/accuracy_2p5d.py rename to colossalai/legacy/nn/metric/accuracy_2p5d.py index 3044da065de1..183380cd9846 100644 --- a/colossalai/nn/metric/accuracy_2p5d.py +++ b/colossalai/legacy/nn/metric/accuracy_2p5d.py @@ -1,7 +1,8 @@ import torch -from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d from torch import nn +from colossalai.legacy.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d + from ._utils import calc_acc diff --git a/colossalai/nn/metric/accuracy_3d.py b/colossalai/legacy/nn/metric/accuracy_3d.py similarity index 85% rename from colossalai/nn/metric/accuracy_3d.py rename to colossalai/legacy/nn/metric/accuracy_3d.py index 5506fc1d2ffc..1aaac73ecabd 100644 --- a/colossalai/nn/metric/accuracy_3d.py +++ b/colossalai/legacy/nn/metric/accuracy_3d.py @@ -1,33 +1,35 @@ -import torch -from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D -from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d -from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env -from torch import nn - -from ._utils import calc_acc - - -class Accuracy3D(nn.Module): - """Accuracy for 3D parallelism - """ - def __init__(self): - super().__init__() - self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D) - self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D) - - def forward(self, logits, targets): - """Calculate the accuracy of predicted labels. - - Args: - logits (:class:`torch.tensor`): Predicted labels. - targets (:class:`torch.tensor`): True labels from data. - - Returns: - float: the accuracy of prediction. 
- """ - with torch.no_grad(): - targets = split_tensor_3d(targets, 0, self.weight_parallel_mode) - targets = split_tensor_3d(targets, 0, self.input_parallel_mode) - correct = calc_acc(logits, targets) - correct = reduce_by_batch_3d(correct, self.input_parallel_mode, self.weight_parallel_mode) - return correct +import torch +from torch import nn + +from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D +from colossalai.legacy.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d +from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env + +from ._utils import calc_acc + + +class Accuracy3D(nn.Module): + """Accuracy for 3D parallelism + """ + + def __init__(self): + super().__init__() + self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D) + self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D) + + def forward(self, logits, targets): + """Calculate the accuracy of predicted labels. + + Args: + logits (:class:`torch.tensor`): Predicted labels. + targets (:class:`torch.tensor`): True labels from data. + + Returns: + float: the accuracy of prediction. + """ + with torch.no_grad(): + targets = split_tensor_3d(targets, 0, self.weight_parallel_mode) + targets = split_tensor_3d(targets, 0, self.input_parallel_mode) + correct = calc_acc(logits, targets) + correct = reduce_by_batch_3d(correct, self.input_parallel_mode, self.weight_parallel_mode) + return correct diff --git a/colossalai/nn/parallel/__init__.py b/colossalai/legacy/nn/parallel/__init__.py similarity index 100% rename from colossalai/nn/parallel/__init__.py rename to colossalai/legacy/nn/parallel/__init__.py diff --git a/colossalai/nn/parallel/data_parallel.py b/colossalai/legacy/nn/parallel/data_parallel.py similarity index 100% rename from colossalai/nn/parallel/data_parallel.py rename to colossalai/legacy/nn/parallel/data_parallel.py diff --git a/colossalai/nn/parallel/layers/__init__.py b/colossalai/legacy/nn/parallel/layers/__init__.py similarity index 56% rename from colossalai/nn/parallel/layers/__init__.py rename to colossalai/legacy/nn/parallel/layers/__init__.py index 29b8353e63c5..f38124efedf7 100644 --- a/colossalai/nn/parallel/layers/__init__.py +++ b/colossalai/legacy/nn/parallel/layers/__init__.py @@ -1,10 +1,17 @@ +from .cache_embedding import ( + CachedEmbeddingBag, + CachedParamMgr, + EvictionStrategy, + LimitBuffIndexCopyer, + ParallelCachedEmbeddingBag, + ParallelCachedEmbeddingBagTablewise, + ParallelCachedEmbeddingBagTablewiseSpiltCache, + TablewiseEmbeddingBagConfig, +) from .colo_module import ColoModule -from .linear import ColoLinear from .embedding import ColoEmbedding -from .module_utils import register_colo_module, is_colo_module, get_colo_module, init_colo_module, check_colo_module - -from .cache_embedding import CachedEmbeddingBag, ParallelCachedEmbeddingBag, CachedParamMgr, LimitBuffIndexCopyer, EvictionStrategy, \ - ParallelCachedEmbeddingBagTablewise, TablewiseEmbeddingBagConfig, ParallelCachedEmbeddingBagTablewiseSpiltCache +from .linear import ColoLinear +from .module_utils import check_colo_module, get_colo_module, init_colo_module, is_colo_module, register_colo_module __all__ = [ 'ColoModule', 'register_colo_module', 'is_colo_module', 'get_colo_module', 'init_colo_module', 'check_colo_module', diff --git a/colossalai/nn/parallel/layers/cache_embedding/__init__.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/__init__.py similarity index 100% rename from colossalai/nn/parallel/layers/cache_embedding/__init__.py 
rename to colossalai/legacy/nn/parallel/layers/cache_embedding/__init__.py index 5bbc931a79dc..d87930c1c6b3 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/__init__.py +++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/__init__.py @@ -1,8 +1,8 @@ from .cache_mgr import CachedParamMgr, EvictionStrategy -from .copyer import LimitBuffIndexCopyer from .cached_embedding import CachedEmbeddingBag -from .parallel_cached_embedding import ParallelCachedEmbeddingBag +from .copyer import LimitBuffIndexCopyer from .embedding_config import TablewiseEmbeddingBagConfig +from .parallel_cached_embedding import ParallelCachedEmbeddingBag from .parallel_cached_embedding_tablewise import ParallelCachedEmbeddingBagTablewise from .parallel_cached_embedding_tablewise_split_cache import ParallelCachedEmbeddingBagTablewiseSpiltCache diff --git a/colossalai/nn/parallel/layers/cache_embedding/base_embedding.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/base_embedding.py similarity index 99% rename from colossalai/nn/parallel/layers/cache_embedding/base_embedding.py rename to colossalai/legacy/nn/parallel/layers/cache_embedding/base_embedding.py index 705835a0ed22..9558c541e703 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/base_embedding.py +++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/base_embedding.py @@ -1,4 +1,5 @@ import abc + import torch.nn as nn diff --git a/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/cache_mgr.py similarity index 99% rename from colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py rename to colossalai/legacy/nn/parallel/layers/cache_embedding/cache_mgr.py index a6159856dcce..16530c4ce7b8 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py +++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/cache_mgr.py @@ -1,12 +1,14 @@ +import sys +from contextlib import contextmanager +from enum import Enum +from typing import List, Optional + import numpy as np import torch -from torch.profiler import record_function -from typing import List, Optional from contexttimer import Timer +from torch.profiler import record_function + from .copyer import LimitBuffIndexCopyer -from enum import Enum -import sys -from contextlib import contextmanager class EvictionStrategy(Enum): @@ -35,7 +37,7 @@ def _wait_for_data(t, stream: Optional[torch.cuda.streams.Stream]) -> None: class CachedParamMgr(torch.nn.Module): """ Manage Embedding Weights on CPU and CUDA memory uses a software cache. - CPU maintains the entire original weight. + CPU maintains the entire original weight. CUDA maintains a fraction of the weights used in the upcoming computation. The row number in CUDA is controlled by `cuda_row_num`. During training, GPU needs to transmit embedding rows between CPU and GPU. Args: @@ -115,7 +117,7 @@ def timer(self, name): self._elapsed_dict[name] += t.elapsed def _find_evict_gpu_idxs(self, evict_num: int) -> torch.Tensor: - """_find_evict_gpu_idxs + """_find_evict_gpu_idxs Find the gpu idxs to be evicted, according to their freq. Args: evict_num (int): how many rows has to be evicted @@ -202,7 +204,7 @@ def reorder(self, ids_freq_mapping: Optional[List[int]] = None, warmup_ratio=0.7 """reorder reorder the weight according to ids' frequency in dataset before training. Execute only once before training, also known as warmup phase. - + Note: If you would like to use the DATASET as the eviction strategy, you must call this function. 
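`CachedParamMgr`'s docstring above describes the software cache these moved files implement: the full embedding weight stays on CPU, a fraction of rows (bounded by `cuda_row_num`) lives on CUDA, and `reorder` warms that cache from an id-frequency mapping before training. A small self-contained sketch of the warmup idea, using hypothetical names rather than the manager's real API:

```python
import torch

def warmup_rows(cpu_weight: torch.Tensor, ids_freq: torch.Tensor, cuda_row_num: int):
    """Reorder rows so the hottest ids come first, then preload the top rows to CUDA."""
    order = torch.argsort(ids_freq, descending=True)   # most frequent ids first
    reordered = cpu_weight[order]                       # hot rows now sit at the top
    cuda_cache = reordered[:cuda_row_num].cuda()        # the GPU-resident fraction
    # map original id -> its new row position so lookups still resolve correctly
    new_pos = torch.empty_like(order)
    new_pos[order] = torch.arange(order.numel())
    return reordered, cuda_cache, new_pos
```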
Note: @@ -516,7 +518,7 @@ def _evict(self) -> int: """ deprecated evict one row from cuda to cpu. - Returns: + Returns: (int) : the slot id be evicted. """ mask = torch.logical_or(torch.isin(self.cached_idx_map, self.evict_backlist), self.cached_idx_map == -1) diff --git a/colossalai/nn/parallel/layers/cache_embedding/cached_embedding.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/cached_embedding.py similarity index 98% rename from colossalai/nn/parallel/layers/cache_embedding/cached_embedding.py rename to colossalai/legacy/nn/parallel/layers/cache_embedding/cached_embedding.py index a74cb8d94bab..bc7d178906da 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/cached_embedding.py +++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/cached_embedding.py @@ -1,10 +1,11 @@ +from typing import Iterator, List, Optional, Tuple, Union + import torch import torch.nn.functional as F -from typing import List, Optional, Iterator, Tuple, Union +from torch.nn.parameter import Parameter from .base_embedding import BaseEmbeddingBag from .cache_mgr import CachedParamMgr, EvictionStrategy -from torch.nn.parameter import Parameter class CachedEmbeddingBag(BaseEmbeddingBag): @@ -27,7 +28,7 @@ class CachedEmbeddingBag(BaseEmbeddingBag): include_last_offset (bool, optional): if True, offsets has one additional element, where the last element is equivalent to the size of indices. This matches the CSR format.. Defaults to False. dtype (torch.dtype, optional): data type of the cpu weight initialization. Defaults to None meaning float32. device (torch.device, optional): device type to the cpu weight. Defaults to None meaning cpu. - cache_ratio (float, float): cache ratio of the #cuda_weight_row / #cpu_weight_row + cache_ratio (float, float): cache ratio of the #cuda_weight_row / #cpu_weight_row ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency of each embedding vector occurs in dataset. Defaults to None. warmup_ratio (float, optional): the ratio of cuda cache is warmuped with. Defaults to 0.7. buffer_size (int, optional): the max number of vectors in transmitter buffer. If set to 0, the buffer is not used. Defaults to 0. @@ -85,10 +86,10 @@ def _preprocess(self, buffer_size=50_000, pin_weight=False): """ - Called after initialized. + Called after initialized. Reorder the weight rows according to the ids_freq_mapping. Then, let the weights of the Module be managed by a CachedParamMgr. - + Args: cuda_row_num (int): number of rows can be hosted in CUDA memory ids_freq_mapping (List[int]): a list, idx is id number, value is freq diff --git a/colossalai/nn/parallel/layers/cache_embedding/copyer.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/copyer.py similarity index 97% rename from colossalai/nn/parallel/layers/cache_embedding/copyer.py rename to colossalai/legacy/nn/parallel/layers/cache_embedding/copyer.py index aa1f794482f9..804a07f88207 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/copyer.py +++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/copyer.py @@ -3,7 +3,7 @@ class LimitBuffIndexCopyer(object): - """LimitBuffIndexCopyer + """LimitBuffIndexCopyer Index Copy using limited temp buffer on CUDA. 
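`LimitBuffIndexCopyer` moves rows between source and target tensors through a bounded staging buffer, which is what keeps CPU-CUDA traffic for the cache manageable. A rough sketch of that chunked index-copy pattern (illustrative only, with hypothetical names, and assuming the index tensors live on the same device as the tensors they index):

```python
import torch

@torch.no_grad()
def buffered_index_copy(dim, src_index, tgt_index, src, tgt, buff_size=1024):
    # process at most buff_size rows at a time through a temporary buffer
    for start in range(0, src_index.numel(), buff_size):
        idx_src = src_index[start:start + buff_size]
        idx_tgt = tgt_index[start:start + buff_size]
        tmp = src.index_select(dim, idx_src)   # stage: src[idx_src] -> tmp
        tgt.index_copy_(dim, idx_tgt, tmp)     # scatter: tmp -> tgt[idx_tgt]
```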
Args: @@ -15,7 +15,7 @@ def __init__(self, size: int) -> None: @torch.no_grad() def index_copy(self, dim: int, src_index: LongTensor, tgt_index: LongTensor, src: torch.Tensor, tgt: torch.Tensor): - """copy + """copy src tensor[src_index] -(index_select)-> tmp -(index_copy_)-> tgt tensor [tgt_index] The valid rows in the src tensor are continuous, while rows in tgt tensor is scattered. diff --git a/colossalai/nn/parallel/layers/cache_embedding/embedding_config.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/embedding_config.py similarity index 100% rename from colossalai/nn/parallel/layers/cache_embedding/embedding_config.py rename to colossalai/legacy/nn/parallel/layers/cache_embedding/embedding_config.py diff --git a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py similarity index 96% rename from colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py rename to colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py index d7f77e195f4b..79d7672b26bc 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py +++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py @@ -1,12 +1,13 @@ +from typing import Iterator, List, Optional, Tuple + import torch import torch.nn.functional as F -from typing import List, Optional, Iterator, Tuple -from .cached_embedding import CachedEmbeddingBag -from colossalai.nn._ops._utils import dual_all_to_all +from colossalai.legacy.nn._ops._utils import dual_all_to_all +from colossalai.tensor import ColoParameter, ColoTensor, ColoTensorSpec, ComputePattern, ProcessGroup, ShardSpec -from colossalai.tensor import ColoParameter, ShardSpec, ComputePattern, ProcessGroup, ColoTensorSpec, ColoTensor from .cache_mgr import CachedParamMgr, EvictionStrategy +from .cached_embedding import CachedEmbeddingBag def get_partition(embedding_dim, rank, world_size) -> Tuple[int, int, bool]: diff --git a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py similarity index 99% rename from colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py rename to colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py index 949f85ad4baf..116d836b7139 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py +++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py @@ -1,15 +1,16 @@ +import time +from typing import List + import torch import torch.distributed as dist import torch.nn.functional as F -from .cached_embedding import CachedEmbeddingBag -from .cache_mgr import EvictionStrategy -from .embedding_config import TablewiseEmbeddingBagConfig +from colossalai.legacy.nn._ops._utils import dual_all_to_all_tablewise from colossalai.tensor import ProcessGroup -from colossalai.nn._ops._utils import dual_all_to_all_tablewise -from typing import List -import time +from .cache_mgr import EvictionStrategy +from .cached_embedding import CachedEmbeddingBag +from .embedding_config import TablewiseEmbeddingBagConfig class ParallelCachedEmbeddingBagTablewise(CachedEmbeddingBag): diff --git a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py 
b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py similarity index 99% rename from colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py rename to colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py index 80a54b4fadd4..0014c784fba1 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py +++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py @@ -1,17 +1,17 @@ +import abc +from typing import List + import torch import torch.distributed as dist import torch.nn as nn from torch.profiler import record_function -from .cached_embedding import CachedEmbeddingBag - +from colossalai.legacy.nn._ops._utils import dual_all_to_all_tablewise from colossalai.tensor import ProcessGroup -from colossalai.nn._ops._utils import dual_all_to_all_tablewise -from .embedding_config import TablewiseEmbeddingBagConfig -from .cache_mgr import EvictionStrategy -from typing import List -import abc +from .cache_mgr import EvictionStrategy +from .cached_embedding import CachedEmbeddingBag +from .embedding_config import TablewiseEmbeddingBagConfig class ParallelCachedEmbeddingBagTablewiseSpiltCache(abc.ABC, nn.Module): diff --git a/colossalai/nn/parallel/layers/colo_module.py b/colossalai/legacy/nn/parallel/layers/colo_module.py similarity index 98% rename from colossalai/nn/parallel/layers/colo_module.py rename to colossalai/legacy/nn/parallel/layers/colo_module.py index 8f0f5d5f520a..a0a3eb40cf08 100644 --- a/colossalai/nn/parallel/layers/colo_module.py +++ b/colossalai/legacy/nn/parallel/layers/colo_module.py @@ -1,6 +1,7 @@ -from colossalai.tensor.distspec import _DistSpec +from typing import Dict, List + from colossalai.tensor import ComputePattern -from typing import List, Dict +from colossalai.tensor.distspec import _DistSpec class ColoModule(object): diff --git a/colossalai/nn/parallel/layers/embedding.py b/colossalai/legacy/nn/parallel/layers/embedding.py similarity index 92% rename from colossalai/nn/parallel/layers/embedding.py rename to colossalai/legacy/nn/parallel/layers/embedding.py index ccacc1ead297..3e4e7ffd8de7 100644 --- a/colossalai/nn/parallel/layers/embedding.py +++ b/colossalai/legacy/nn/parallel/layers/embedding.py @@ -1,5 +1,6 @@ +from colossalai.tensor import ComputePattern, ProcessGroup, ShardSpec, distspec + from .colo_module import ColoModule -from colossalai.tensor import ComputePattern, distspec, ProcessGroup, ShardSpec class ColoEmbedding(ColoModule): diff --git a/colossalai/nn/parallel/layers/linear.py b/colossalai/legacy/nn/parallel/layers/linear.py similarity index 93% rename from colossalai/nn/parallel/layers/linear.py rename to colossalai/legacy/nn/parallel/layers/linear.py index 84a8c042587d..e391cf808933 100644 --- a/colossalai/nn/parallel/layers/linear.py +++ b/colossalai/legacy/nn/parallel/layers/linear.py @@ -1,5 +1,6 @@ +from colossalai.tensor import ComputePattern, ProcessGroup, ShardSpec, distspec + from .colo_module import ColoModule -from colossalai.tensor import ComputePattern, distspec, ProcessGroup, ShardSpec class ColoLinear(ColoModule): diff --git a/colossalai/nn/parallel/layers/module_utils.py b/colossalai/legacy/nn/parallel/layers/module_utils.py similarity index 99% rename from colossalai/nn/parallel/layers/module_utils.py rename to colossalai/legacy/nn/parallel/layers/module_utils.py index 
38d128cc705e..191266fa70fd 100644 --- a/colossalai/nn/parallel/layers/module_utils.py +++ b/colossalai/legacy/nn/parallel/layers/module_utils.py @@ -1,9 +1,11 @@ from typing import Dict -from colossalai.tensor import ColoParameter, ComputeSpec, ProcessGroup -from colossalai.tensor import distspec -from . import ColoModule + import torch +from colossalai.tensor import ColoParameter, ComputeSpec, ProcessGroup, distspec + +from . import ColoModule + _COLOSSAL_MODULES: Dict[type, ColoModule] = {} diff --git a/colossalai/nn/parallel/reducer.py b/colossalai/legacy/nn/parallel/reducer.py similarity index 100% rename from colossalai/nn/parallel/reducer.py rename to colossalai/legacy/nn/parallel/reducer.py diff --git a/colossalai/nn/__init__.py b/colossalai/nn/__init__.py index 910ad203180c..5ea46f7dd7bd 100644 --- a/colossalai/nn/__init__.py +++ b/colossalai/nn/__init__.py @@ -1,6 +1,4 @@ -from ._ops import * from .layer import * from .loss import * from .lr_scheduler import * -from .metric import * from .optimizer import * diff --git a/colossalai/nn/layer/__init__.py b/colossalai/nn/layer/__init__.py index b705632f8040..edd986ef5e82 100644 --- a/colossalai/nn/layer/__init__.py +++ b/colossalai/nn/layer/__init__.py @@ -1,10 +1,2 @@ -from .colossalai_layer import * -from .parallel_1d import * -from .parallel_2d import * -from .parallel_2p5d import * -from .parallel_3d import * -from .parallel_sequence import * from .moe import * from .utils import * -from .vanilla import * -from .wrapper import * diff --git a/colossalai/nn/layer/parallel_1d/__init__.py b/colossalai/nn/layer/parallel_1d/__init__.py deleted file mode 100644 index 2353851df665..000000000000 --- a/colossalai/nn/layer/parallel_1d/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from .layers import (Classifier1D, Dropout1D, Embedding1D, LayerNorm1D, Linear1D, Linear1D_Col, Linear1D_Row, - PatchEmbedding1D, VocabParallelClassifier1D, VocabParallelEmbedding1D) - -__all__ = [ - 'Linear1D', 'Linear1D_Col', 'Linear1D_Row', 'Embedding1D', 'Dropout1D', 'Classifier1D', 'VocabParallelClassifier1D', - 'VocabParallelEmbedding1D', 'LayerNorm1D', 'PatchEmbedding1D' -] diff --git a/colossalai/nn/layer/utils.py b/colossalai/nn/layer/utils.py new file mode 100644 index 000000000000..dc12ff8daa4e --- /dev/null +++ b/colossalai/nn/layer/utils.py @@ -0,0 +1,14 @@ +def divide(numerator, denominator): + """Only allow exact division. + + Args: + numerator (int): Numerator of the division. + denominator (int): Denominator of the division. + + Returns: + int: the result of exact division. 
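The small `colossalai/nn/layer/utils.py` created here keeps only the `divide` helper, which asserts exact divisibility before dividing; its usual callers are partition-size computations where a remainder would indicate a configuration bug. A quick usage note (illustrative, not part of the patch):

```python
from colossalai.nn.layer.utils import divide

divide(12, 4)   # -> 3
divide(10, 4)   # raises AssertionError: 10 is not divisible by 4
```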
+ """ + assert denominator != 0, 'denominator can not be zero' + assert numerator % denominator == 0, \ + '{} is not divisible by {}'.format(numerator, denominator) + return numerator // denominator diff --git a/colossalai/nn/layer/utils/__init__.py b/colossalai/nn/layer/utils/__init__.py deleted file mode 100644 index 7e999ee82149..000000000000 --- a/colossalai/nn/layer/utils/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from .common import (ACT2FN, CheckpointModule, _ntuple, divide, get_tensor_parallel_mode, - set_tensor_parallel_attribute_by_partition, set_tensor_parallel_attribute_by_size, to_2tuple) - -__all__ = [ - 'CheckpointModule', 'divide', 'ACT2FN', 'set_tensor_parallel_attribute_by_size', - 'set_tensor_parallel_attribute_by_partition', 'get_tensor_parallel_mode', '_ntuple', 'to_2tuple' -] diff --git a/colossalai/nn/loss/__init__.py b/colossalai/nn/loss/__init__.py index 373e4ec9468b..ee2add48ab91 100644 --- a/colossalai/nn/loss/__init__.py +++ b/colossalai/nn/loss/__init__.py @@ -1,41 +1 @@ -from colossalai.global_variables import tensor_parallel_env as env -from colossalai.nn.layer.utils import get_tensor_parallel_mode -from torch import nn -from torch.nn.modules.loss import * -from torch.nn.modules.loss import _Loss - -from .loss_1d import VocabParallelCrossEntropyLoss1D -from .loss_2d import CrossEntropyLoss2D, VocabParallelCrossEntropyLoss2D -from .loss_2p5d import CrossEntropyLoss2p5D, VocabParallelCrossEntropyLoss2p5D -from .loss_3d import CrossEntropyLoss3D, VocabParallelCrossEntropyLoss3D from .loss_moe import MoeCrossEntropyLoss, MoeLoss - -_parallel_cross_entropy = { - '2d': CrossEntropyLoss2D, - '2.5d': CrossEntropyLoss2p5D, - '3d': CrossEntropyLoss3D, -} - -_vocab_parallel_cross_entropy = { - '1d': VocabParallelCrossEntropyLoss1D, - '2d': VocabParallelCrossEntropyLoss2D, - '2.5d': VocabParallelCrossEntropyLoss2p5D, - '3d': VocabParallelCrossEntropyLoss3D, -} - - -class CrossEntropyLoss(_Loss): - - def __init__(self, reduction: bool = True, *args, **kwargs): - super().__init__() - tensor_parallel = get_tensor_parallel_mode() - if tensor_parallel is not None and env.vocab_parallel: - self.loss = _vocab_parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs) - elif tensor_parallel is None or tensor_parallel == '1d': - reduction = 'mean' if reduction else 'none' - self.loss = nn.CrossEntropyLoss(reduction=reduction, *args, **kwargs) - else: - self.loss = _parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs) - - def forward(self, *args): - return self.loss(*args) diff --git a/colossalai/pipeline/pipelinable.py b/colossalai/pipeline/pipelinable.py index 79913987b7cc..ba8b1591da9d 100644 --- a/colossalai/pipeline/pipelinable.py +++ b/colossalai/pipeline/pipelinable.py @@ -1,15 +1,24 @@ -import torch import inspect -from colossalai.utils.model.utils import InsertPostInitMethodToModuleSubClasses -from .utils import partition_uniform, partition_balanced, build_kwargs_for_function, \ - build_kwargs_for_module, exec_func_with_kwargs, exec_funcs_with_kwargs, \ - call_module, customized_partition -from colossalai.nn.layer.utils import CheckpointModule -from colossalai.tensor import ColoParameter -from colossalai.core import global_context as gpc +import torch + from colossalai.context import ParallelMode +from colossalai.core import global_context as gpc +from colossalai.legacy.nn.layer.utils import CheckpointModule +from colossalai.tensor import ColoParameter +from colossalai.utils.model.utils import 
InsertPostInitMethodToModuleSubClasses + from .layer_spec import LayerSpec +from .utils import ( + build_kwargs_for_function, + build_kwargs_for_module, + call_module, + customized_partition, + exec_func_with_kwargs, + exec_funcs_with_kwargs, + partition_balanced, + partition_uniform, +) class PipelinableContext(InsertPostInitMethodToModuleSubClasses): diff --git a/colossalai/pipeline/utils.py b/colossalai/pipeline/utils.py index ac8a3ad7d1db..be8428692756 100644 --- a/colossalai/pipeline/utils.py +++ b/colossalai/pipeline/utils.py @@ -1,12 +1,13 @@ import heapq import inspect +from collections import OrderedDict +from typing import List + import torch +from colossalai.legacy.nn.layer.utils import CheckpointModule from colossalai.logging import get_dist_logger -from colossalai.nn.layer.utils import CheckpointModule -from typing import List -from collections import OrderedDict def _binary_partition(weights: List, start: int, end: int): """Returns the binary partition position of `weights`, given the start @@ -162,7 +163,7 @@ def build_kwargs_for_module(function, input_tensor, kw_dict): kwargs_offset = 1 elif isinstance(input_tensor, (tuple, OrderedDict)): #assert isinstance(input_tensor, tuple), f'input_tensor should be a torch.Tensor or a tuple object.' - # Huggingface will take their own structures based on OrderedDict as the output + # Huggingface will take their own structures based on OrderedDict as the output # between layers so we've to close this check. kwargs_offset = len(input_tensor) args_name_list = list(sig.parameters.keys()) @@ -256,7 +257,7 @@ def call_module(module, args=None, kwargs=None): def customized_partition(exec_seq): ''' - This function will analyze the exec_seq. In the exec_seq, users will use 'SPLIT_NODE' as an + This function will analyze the exec_seq. In the exec_seq, users will use 'SPLIT_NODE' as an annotation to note the partition point. ''' customized_parts = {} diff --git a/colossalai/tensor/dist_spec_mgr.py b/colossalai/tensor/dist_spec_mgr.py index c968050de49d..4740a316b7f5 100644 --- a/colossalai/tensor/dist_spec_mgr.py +++ b/colossalai/tensor/dist_spec_mgr.py @@ -2,7 +2,6 @@ import torch import torch.distributed as dist -# from colossalai.nn.layer.utils import divide from numpy import prod from colossalai.tensor.distspec import DistPlacementPattern, _DistSpec diff --git a/colossalai/utils/__init__.py b/colossalai/utils/__init__.py index 7b2e8480c66c..6f9717d353e6 100644 --- a/colossalai/utils/__init__.py +++ b/colossalai/utils/__init__.py @@ -1,12 +1,14 @@ from .activation_checkpoint import checkpoint from .checkpointing import load_checkpoint, save_checkpoint from .common import ( + _cast_float, clip_grad_norm_fp32, conditional_context, copy_tensor_parallel_attributes, count_zeros_fp32, disposable, ensure_path_exists, + free_storage, is_ddp_ignored, is_dp_rank_0, is_model_parallel_parameter, @@ -72,4 +74,6 @@ 'disposable', 'colo_set_cpu_memory_capacity', 'colo_get_cpu_memory_capacity', + '_cast_float', + 'free_storage', ] diff --git a/colossalai/utils/common.py b/colossalai/utils/common.py index 8022e84dc24b..998901708239 100644 --- a/colossalai/utils/common.py +++ b/colossalai/utils/common.py @@ -470,3 +470,22 @@ def wrapper(*args, **kwargs): return func(*args, **kwargs) return wrapper + + +def free_storage(data: torch.Tensor) -> None: + """Free underlying storage of a Tensor.""" + if data.storage().size() > 0: + # Since we're modifying the Tensor's Storage directly, make sure the Tensor + # is the sole occupant of the Storage. 
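`free_storage` and `_cast_float`, now exported from `colossalai.utils`, are small helpers Gemini relies on: the first releases a tensor's underlying storage while keeping its metadata, the second recursively casts floating-point tensors inside nested lists, tuples, and dicts. An illustrative usage sketch (not part of the patch):

```python
import torch
from colossalai.utils import _cast_float, free_storage

t = torch.empty(4, 4)
free_storage(t)                       # bytes are released...
assert t.storage().size() == 0        # ...but the tensor still reports shape (4, 4)

batch = {"x": torch.randn(2, 3), "mask": torch.ones(2, 3, dtype=torch.bool)}
half = _cast_float(batch, torch.float16)   # only floating-point tensors are cast
assert half["x"].dtype == torch.float16 and half["mask"].dtype == torch.bool
```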
+ assert data.storage_offset() == 0 + data.storage().resize_(0) + + +def _cast_float(args, dtype: torch.dtype): + if isinstance(args, torch.Tensor) and torch.is_floating_point(args): + args = args.to(dtype) + elif isinstance(args, (list, tuple)): + args = type(args)(_cast_float(t, dtype) for t in args) + elif isinstance(args, dict): + args = {k: _cast_float(v, dtype) for k, v in args.items()} + return args diff --git a/colossalai/zero/gemini/colo_init_context.py b/colossalai/zero/gemini/colo_init_context.py index 75f8576ca477..dad852a34a71 100644 --- a/colossalai/zero/gemini/colo_init_context.py +++ b/colossalai/zero/gemini/colo_init_context.py @@ -87,7 +87,7 @@ def __init__(self, self._default_dist_spec = default_dist_spec def _register_colo_modules(self): - from colossalai.nn.parallel.layers import ColoEmbedding, ColoLinear, register_colo_module + from colossalai.legacy.nn.parallel.layers import ColoEmbedding, ColoLinear, register_colo_module register_colo_module(torch.nn.Linear, ColoLinear()) register_colo_module(torch.nn.Embedding, ColoEmbedding()) diff --git a/colossalai/zero/gemini/gemini_ddp.py b/colossalai/zero/gemini/gemini_ddp.py index 741a977d1ea0..918b08cd3150 100644 --- a/colossalai/zero/gemini/gemini_ddp.py +++ b/colossalai/zero/gemini/gemini_ddp.py @@ -10,15 +10,13 @@ from torch.distributed import ProcessGroup from torch.distributed.distributed_c10d import _get_default_group -from colossalai.checkpoint_io.utils import calculate_tensor_size, StateDictSharder +from colossalai.checkpoint_io.utils import StateDictSharder, calculate_tensor_size from colossalai.interface import ModelWrapper - from colossalai.lazy import LazyTensor from colossalai.logging import get_dist_logger -from colossalai.nn.parallel.data_parallel import _cast_float, free_storage from colossalai.tensor.colo_parameter import ColoParameter from colossalai.tensor.param_op_hook import ColoParamOpHookManager -from colossalai.utils import get_current_device, is_ddp_ignored +from colossalai.utils import _cast_float, free_storage, get_current_device, is_ddp_ignored from .chunk import Chunk, ChunkManager, TensorState, init_chunk_manager from .gemini_hook import GeminiZeROHook @@ -780,5 +778,3 @@ def state_dict_shard(self, yield block, block_size yield sharder.current_block, sharder.current_block_size - - diff --git a/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py b/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py index 0c9eac8b63e3..e5466965cc48 100644 --- a/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py +++ b/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py @@ -1,7 +1,7 @@ import torch.nn -from colossalai.nn.parallel.data_parallel import _cast_float from colossalai.tensor.param_op_hook import ColoParamOpHookManager +from colossalai.utils import _cast_float from colossalai.zero.legacy.gemini.ophooks.runtime_mem_tracer_hook import ( GradMemStats, GradMemTracerHook, diff --git a/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md b/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md index 281fd47554ca..0a94a7f5d691 100644 --- a/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md +++ b/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md @@ -176,7 +176,7 @@ In our latest example, a Gemini + ZeRO DDP model is also defined to reduce overh ```python def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"): - from colossalai.nn.parallel import 
GeminiDDP + from colossalai.zero import GeminiDDP model = GeminiDDP(model, device=get_current_device(), placement_policy=placement_policy, diff --git a/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md b/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md index 5aa806c64322..36c94fb492cd 100644 --- a/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md +++ b/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md @@ -42,7 +42,7 @@ from colossalai.core import global_context as gpc from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule, PipelineSchedule) from colossalai.logging import disable_existing_loggers, get_dist_logger -from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper +from colossalai.legacy.nn.layer.wrapper import PipelineSharedModuleWrapper from colossalai.legacy.trainer import Trainer, hooks from colossalai.utils.timer import MultiTimer from model_zoo.gpt import GPTLMLoss diff --git a/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md index 22022639ce12..0ec9d5c3c5de 100644 --- a/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md +++ b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md @@ -78,7 +78,7 @@ from colossalai.context import ParallelMode from colossalai.core import global_context as gpc from colossalai.logging import disable_existing_loggers, get_dist_logger from colossalai.nn.lr_scheduler import LinearWarmupLR -from colossalai.nn.metric import Accuracy +from colossalai.legacy.nn.metric import Accuracy from colossalai.legacy.trainer import Trainer, hooks ``` diff --git a/docs/source/en/basics/engine_trainer.md b/docs/source/en/basics/engine_trainer.md index 6d2355ad9044..e17c37e24a55 100644 --- a/docs/source/en/basics/engine_trainer.md +++ b/docs/source/en/basics/engine_trainer.md @@ -344,7 +344,7 @@ for epoch in range(gpc.config.NUM_EPOCHS): If you wish to train with a trainer object, you can follow the code snippet below: ```python -from colossalai.nn.metric import Accuracy +from colossalai.legacy.nn.metric import Accuracy from colossalai.legacy.trainer import Trainer, hooks diff --git a/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md b/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md index 3f85d50454ae..dfd1e2910b4e 100644 --- a/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md +++ b/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md @@ -160,7 +160,7 @@ for mn, module in model.named_modules(): ```python def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"): - from colossalai.nn.parallel import GeminiDDP + from colossalai.zero import GeminiDDP model = GeminiDDP(model, device=get_current_device(), placement_policy=placement_policy, diff --git a/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md b/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md index 9cfbf58731b8..3f57f39f2838 100644 --- a/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md +++ b/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md @@ -42,7 +42,7 @@ from colossalai.core import global_context as gpc from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule, PipelineSchedule) 
from colossalai.logging import disable_existing_loggers, get_dist_logger -from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper +from colossalai.legacy.nn.layer.wrapper import PipelineSharedModuleWrapper from colossalai.legacy.trainer import Trainer, hooks from colossalai.utils.timer import MultiTimer from model_zoo.gpt import GPTLMLoss diff --git a/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md index 803882a5ad2e..f7dd8d477a66 100644 --- a/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md +++ b/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md @@ -73,7 +73,7 @@ from colossalai.context import ParallelMode from colossalai.core import global_context as gpc from colossalai.logging import disable_existing_loggers, get_dist_logger from colossalai.nn.lr_scheduler import LinearWarmupLR -from colossalai.nn.metric import Accuracy +from colossalai.legacy.nn.metric import Accuracy from colossalai.legacy.trainer import Trainer, hooks ``` diff --git a/docs/source/zh-Hans/basics/engine_trainer.md b/docs/source/zh-Hans/basics/engine_trainer.md index e57220292c98..ed5100299212 100644 --- a/docs/source/zh-Hans/basics/engine_trainer.md +++ b/docs/source/zh-Hans/basics/engine_trainer.md @@ -340,7 +340,7 @@ for epoch in range(gpc.config.NUM_EPOCHS): ```python -from colossalai.nn.metric import Accuracy +from colossalai.legacy.nn.metric import Accuracy from colossalai.legacy.trainer import Trainer, hooks diff --git a/examples/language/gpt/titans/model/embed.py b/examples/language/gpt/titans/model/embed.py index 668992901239..e521193a97da 100644 --- a/examples/language/gpt/titans/model/embed.py +++ b/examples/language/gpt/titans/model/embed.py @@ -8,11 +8,11 @@ from colossalai.context import ParallelMode, seed from colossalai.core import global_context as gpc +from colossalai.legacy.nn.layer.base_layer import ParallelLayer +from colossalai.legacy.nn.layer.parallel_1d._utils import gather_forward_split_backward, reduce_grad, reduce_input +from colossalai.legacy.nn.layer.parallel_1d.layers import Linear1D_Row +from colossalai.legacy.nn.layer.utils import divide from colossalai.legacy.registry import LAYERS, LOSSES, MODELS -from colossalai.nn.layer.base_layer import ParallelLayer -from colossalai.nn.layer.parallel_1d._utils import gather_forward_split_backward, reduce_grad, reduce_input -from colossalai.nn.layer.parallel_1d.layers import Linear1D_Row -from colossalai.nn.layer.utils import divide from colossalai.utils import get_current_device diff --git a/examples/language/gpt/titans/model/gpt1d.py b/examples/language/gpt/titans/model/gpt1d.py index 2edd03606b7d..72297c540da1 100644 --- a/examples/language/gpt/titans/model/gpt1d.py +++ b/examples/language/gpt/titans/model/gpt1d.py @@ -11,9 +11,9 @@ from colossalai import nn as col_nn from colossalai.core import global_context as gpc from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType -from colossalai.nn.layer import Linear1D_Col, Linear1D_Row -from colossalai.nn.layer.base_layer import ParallelLayer -from colossalai.nn.layer.utils import ACT2FN, divide +from colossalai.legacy.nn.layer import Linear1D_Col, Linear1D_Row +from colossalai.legacy.nn.layer.base_layer import ParallelLayer +from colossalai.legacy.nn.layer.utils import ACT2FN, divide from colossalai.utils import checkpoint from colossalai.utils.activation_checkpoint import checkpoint diff --git 
a/examples/language/gpt/titans/model/pipeline_gpt1d.py b/examples/language/gpt/titans/model/pipeline_gpt1d.py index 30180285bc70..9b22d156bbcd 100644 --- a/examples/language/gpt/titans/model/pipeline_gpt1d.py +++ b/examples/language/gpt/titans/model/pipeline_gpt1d.py @@ -9,8 +9,8 @@ from colossalai import nn as col_nn from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc +from colossalai.legacy.nn.layer.wrapper import PipelineSharedModuleWrapper from colossalai.logging import get_dist_logger -from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper from colossalai.pipeline.utils import partition_uniform from .embed import HiddenParallelEmbedding, HiddenParallelGPTLMHead1D, VocabParallelEmbedding, VocabParallelGPTLMHead1D diff --git a/examples/tutorial/hybrid_parallel/train.py b/examples/tutorial/hybrid_parallel/train.py index 4953d5350f31..12cdec902400 100644 --- a/examples/tutorial/hybrid_parallel/train.py +++ b/examples/tutorial/hybrid_parallel/train.py @@ -7,8 +7,8 @@ import colossalai from colossalai.context import ParallelMode from colossalai.core import global_context as gpc +from colossalai.legacy.nn import CrossEntropyLoss from colossalai.logging import get_dist_logger -from colossalai.nn import CrossEntropyLoss from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR from colossalai.pipeline.pipelinable import PipelinableContext from colossalai.utils import is_using_pp diff --git a/examples/tutorial/sequence_parallel/model/bert.py b/examples/tutorial/sequence_parallel/model/bert.py index 049579c5a639..b8adb501f95e 100644 --- a/examples/tutorial/sequence_parallel/model/bert.py +++ b/examples/tutorial/sequence_parallel/model/bert.py @@ -1,33 +1,37 @@ -from colossalai.context.parallel_mode import ParallelMode +import inspect + import torch import torch.nn as nn -import inspect -from .layers import Embedding, BertLayer, BertDualHead, PreProcessor, VocabEmbedding -from .layers.init_method import init_normal, output_init_normal -from colossalai.core import global_context as gpc + from colossalai.context import ParallelMode +from colossalai.context.parallel_mode import ParallelMode +from colossalai.core import global_context as gpc from colossalai.kernel import LayerNorm -from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper +from colossalai.legacy.nn.layer.wrapper import PipelineSharedModuleWrapper from colossalai.logging import get_dist_logger from colossalai.pipeline.utils import partition_uniform +from .layers import BertDualHead, BertLayer, Embedding, PreProcessor, VocabEmbedding +from .layers.init_method import init_normal, output_init_normal + class BertForPretrain(nn.Module): - def __init__(self, - vocab_size, - hidden_size, - max_sequence_length, - num_attention_heads, - num_layers, - add_binary_head, - is_naive_fp16, - num_tokentypes=2, - dropout_prob=0.1, - mlp_ratio=4, - init_std=0.02, - convert_fp16_to_fp32_in_softmax=False, - ): + def __init__( + self, + vocab_size, + hidden_size, + max_sequence_length, + num_attention_heads, + num_layers, + add_binary_head, + is_naive_fp16, + num_tokentypes=2, + dropout_prob=0.1, + mlp_ratio=4, + init_std=0.02, + convert_fp16_to_fp32_in_softmax=False, + ): super().__init__() self.seq_parallel_size = gpc.get_world_size(ParallelMode.SEQUENCE) assert max_sequence_length % self.seq_parallel_size == 0, 'sequence length is not divisible by the sequence parallel size' @@ -47,19 +51,19 @@ def __init__(self, self.bert_layers = nn.ModuleList() for i in 
range(num_layers): - bert_layer = BertLayer(layer_number=i+1, + bert_layer = BertLayer(layer_number=i + 1, hidden_size=hidden_size, num_attention_heads=num_attention_heads, attention_dropout=dropout_prob, mlp_ratio=mlp_ratio, hidden_dropout=dropout_prob, convert_fp16_to_fp32_in_softmax=convert_fp16_to_fp32_in_softmax, - is_naive_fp16=is_naive_fp16 - ) + is_naive_fp16=is_naive_fp16) self.bert_layers.append(bert_layer) self.layer_norm = LayerNorm(hidden_size) - self.head = BertDualHead(hidden_size, self.embedding.word_embedding_weight.size(0), + self.head = BertDualHead(hidden_size, + self.embedding.word_embedding_weight.size(0), add_binary_head=add_binary_head) self.reset_parameters() @@ -166,22 +170,20 @@ def __init__(self, end_idx = num_layers for i in range(start_idx, end_idx): - bert_layer = BertLayer(layer_number=i+1, + bert_layer = BertLayer(layer_number=i + 1, hidden_size=hidden_size, num_attention_heads=num_attention_heads, attention_dropout=dropout_prob, mlp_ratio=mlp_ratio, hidden_dropout=dropout_prob, convert_fp16_to_fp32_in_softmax=convert_fp16_to_fp32_in_softmax, - is_naive_fp16=is_naive_fp16 - ) + is_naive_fp16=is_naive_fp16) self.bert_layers.append(bert_layer) if self.last_stage: self.word_embeddings = VocabEmbedding(vocab_size, hidden_size) self.layer_norm = LayerNorm(hidden_size) - self.head = BertDualHead(hidden_size, vocab_size, - add_binary_head=add_binary_head) + self.head = BertDualHead(hidden_size, vocab_size, add_binary_head=add_binary_head) self.reset_parameters() def _init_normal(self, tensor): diff --git a/examples/tutorial/sequence_parallel/model/layers/bert_layer.py b/examples/tutorial/sequence_parallel/model/layers/bert_layer.py index 4ede21516f65..56ba511d8274 100644 --- a/examples/tutorial/sequence_parallel/model/layers/bert_layer.py +++ b/examples/tutorial/sequence_parallel/model/layers/bert_layer.py @@ -1,10 +1,12 @@ import torch import torch.nn as nn -from colossalai.nn.layer.parallel_sequence import TransformerSelfAttentionRing -from colossalai.kernel.jit import bias_dropout_add_fused_train, bias_dropout_add_fused_inference + from colossalai.kernel.cuda_native import LayerNorm -from .mlp import TransformerMLP +from colossalai.kernel.jit import bias_dropout_add_fused_inference, bias_dropout_add_fused_train +from colossalai.legacy.nn.layer.parallel_sequence import TransformerSelfAttentionRing + from .dropout import get_bias_dropout_add +from .mlp import TransformerMLP def attention_mask_func(attention_scores, attention_mask): @@ -48,8 +50,7 @@ def __init__(self, layer_number=layer_number, apply_query_key_layer_scaling=True, convert_fp16_to_fp32_in_softmax=convert_fp16_to_fp32_in_softmax, - fp16=is_naive_fp16 - ) + fp16=is_naive_fp16) self.hidden_dropout = hidden_dropout self.bias_dropout_fusion = bias_dropout_fusion @@ -89,11 +90,8 @@ def forward(self, hidden_states, attention_mask): # re-enable torch grad to enable fused optimization. with torch.enable_grad(): - layernorm_input = bias_dropout_add_func( - attention_output, - attention_bias.expand_as(residual), - residual, - self.hidden_dropout) + layernorm_input = bias_dropout_add_func(attention_output, attention_bias.expand_as(residual), residual, + self.hidden_dropout) # Layer norm post the self attention. layernorm_output = self.post_attention_layernorm(layernorm_input) @@ -109,10 +107,6 @@ def forward(self, hidden_states, attention_mask): # re-enable torch grad to enable fused optimization. 
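The reflowed calls in `bert_layer.py` around here wrap the fused bias-dropout-add kernels (`bias_dropout_add_fused_train` / `bias_dropout_add_fused_inference`). For reference, a non-fused equivalent of what those kernels compute, following the usual Megatron-style definition (an assumption for illustration, not the kernel source):

```python
import torch
import torch.nn.functional as F

def bias_dropout_add(x, bias, residual, prob, training):
    # dropout(x + bias) added back onto the residual stream
    out = F.dropout(x + bias, p=prob, training=training)
    return residual + out
```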
with torch.enable_grad(): - output = bias_dropout_add_func( - mlp_output, - mlp_bias.expand_as(residual), - residual, - self.hidden_dropout) + output = bias_dropout_add_func(mlp_output, mlp_bias.expand_as(residual), residual, self.hidden_dropout) return output diff --git a/tests/components_to_test/hanging_param_model.py b/tests/components_to_test/hanging_param_model.py index 329a08ea28f0..0e65431217c7 100644 --- a/tests/components_to_test/hanging_param_model.py +++ b/tests/components_to_test/hanging_param_model.py @@ -2,7 +2,7 @@ import torch.nn as nn import torch.nn.functional as F -from colossalai.nn import CheckpointModule +from colossalai.legacy.nn import CheckpointModule from .registry import non_distributed_component_funcs from .utils.dummy_data_generator import DummyDataGenerator diff --git a/tests/components_to_test/inline_op_model.py b/tests/components_to_test/inline_op_model.py index f061d48f92c6..80757f361d9e 100644 --- a/tests/components_to_test/inline_op_model.py +++ b/tests/components_to_test/inline_op_model.py @@ -2,7 +2,7 @@ import torch.nn as nn import torch.nn.functional as F -from colossalai.nn import CheckpointModule +from colossalai.legacy.nn import CheckpointModule from .registry import non_distributed_component_funcs from .utils.dummy_data_generator import DummyDataGenerator diff --git a/tests/components_to_test/nested_model.py b/tests/components_to_test/nested_model.py index 339084639244..3e779b0a6428 100644 --- a/tests/components_to_test/nested_model.py +++ b/tests/components_to_test/nested_model.py @@ -2,7 +2,7 @@ import torch.nn as nn import torch.nn.functional as F -from colossalai.nn import CheckpointModule +from colossalai.legacy.nn import CheckpointModule from .registry import non_distributed_component_funcs from .utils import DummyDataGenerator diff --git a/tests/components_to_test/repeated_computed_layers.py b/tests/components_to_test/repeated_computed_layers.py index b3f84bd0e203..c1ef99aa07b4 100644 --- a/tests/components_to_test/repeated_computed_layers.py +++ b/tests/components_to_test/repeated_computed_layers.py @@ -3,7 +3,7 @@ import torch import torch.nn as nn -from colossalai.nn import CheckpointModule +from colossalai.legacy.nn import CheckpointModule from .registry import non_distributed_component_funcs from .utils.dummy_data_generator import DummyDataGenerator diff --git a/tests/components_to_test/simple_net.py b/tests/components_to_test/simple_net.py index cd9d7ebc0b1a..064974a15a97 100644 --- a/tests/components_to_test/simple_net.py +++ b/tests/components_to_test/simple_net.py @@ -1,7 +1,7 @@ import torch import torch.nn as nn -from colossalai.nn import CheckpointModule +from colossalai.legacy.nn import CheckpointModule from colossalai.utils.cuda import get_current_device from .registry import non_distributed_component_funcs diff --git a/tests/test_layers/test_1d/checks_1d/__init__.py b/tests/test_legacy/test_layers/test_1d/checks_1d/__init__.py similarity index 100% rename from tests/test_layers/test_1d/checks_1d/__init__.py rename to tests/test_legacy/test_layers/test_1d/checks_1d/__init__.py diff --git a/tests/test_layers/test_1d/checks_1d/check_layer_1d.py b/tests/test_legacy/test_layers/test_1d/checks_1d/check_layer_1d.py similarity index 99% rename from tests/test_layers/test_1d/checks_1d/check_layer_1d.py rename to tests/test_legacy/test_layers/test_1d/checks_1d/check_layer_1d.py index 668b8a334800..dcb2be62671b 100644 --- a/tests/test_layers/test_1d/checks_1d/check_layer_1d.py +++ 
b/tests/test_legacy/test_layers/test_1d/checks_1d/check_layer_1d.py @@ -5,7 +5,7 @@ from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc from colossalai.global_variables import tensor_parallel_env as env -from colossalai.nn import ( +from colossalai.legacy.nn import ( Classifier1D, Embedding1D, Linear1D_Col, diff --git a/tests/test_layers/test_1d/checks_1d/common.py b/tests/test_legacy/test_layers/test_1d/checks_1d/common.py similarity index 94% rename from tests/test_layers/test_1d/checks_1d/common.py rename to tests/test_legacy/test_layers/test_1d/checks_1d/common.py index 8b7b28613d22..29a9a3d20330 100644 --- a/tests/test_layers/test_1d/checks_1d/common.py +++ b/tests/test_legacy/test_layers/test_1d/checks_1d/common.py @@ -1,15 +1,16 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -import torch - -DEPTH = 4 -BATCH_SIZE = 8 -SEQ_LENGTH = 8 -IMG_SIZE = 16 -HIDDEN_SIZE = 8 -NUM_CLASSES = 8 -VOCAB_SIZE = 16 - -def check_equal(A, B): - assert torch.allclose(A, B, rtol=1e-3, atol=1e-1) == True +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +import torch + +DEPTH = 4 +BATCH_SIZE = 8 +SEQ_LENGTH = 8 +IMG_SIZE = 16 +HIDDEN_SIZE = 8 +NUM_CLASSES = 8 +VOCAB_SIZE = 16 + + +def check_equal(A, B): + assert torch.allclose(A, B, rtol=1e-3, atol=1e-1) == True diff --git a/tests/test_layers/test_1d/test_1d.py b/tests/test_legacy/test_layers/test_1d/test_1d.py similarity index 100% rename from tests/test_layers/test_1d/test_1d.py rename to tests/test_legacy/test_layers/test_1d/test_1d.py diff --git a/tests/test_layers/test_2d/checks_2d/__init__.py b/tests/test_legacy/test_layers/test_2d/checks_2d/__init__.py similarity index 100% rename from tests/test_layers/test_2d/checks_2d/__init__.py rename to tests/test_legacy/test_layers/test_2d/checks_2d/__init__.py diff --git a/tests/test_layers/test_2d/checks_2d/check_layer_2d.py b/tests/test_legacy/test_layers/test_2d/checks_2d/check_layer_2d.py similarity index 97% rename from tests/test_layers/test_2d/checks_2d/check_layer_2d.py rename to tests/test_legacy/test_layers/test_2d/checks_2d/check_layer_2d.py index e030e473a363..0ee88c26035f 100644 --- a/tests/test_layers/test_2d/checks_2d/check_layer_2d.py +++ b/tests/test_legacy/test_layers/test_2d/checks_2d/check_layer_2d.py @@ -1,12 +1,23 @@ import torch + from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc -from colossalai.nn import (Classifier2D, CrossEntropyLoss2D, Embedding2D, LayerNorm2D, Linear2D, PatchEmbedding2D, - VanillaClassifier, VanillaPatchEmbedding, VocabParallelClassifier2D, - VocabParallelCrossEntropyLoss2D, VocabParallelEmbedding2D) +from colossalai.legacy.nn import ( + Classifier2D, + CrossEntropyLoss2D, + Embedding2D, + LayerNorm2D, + Linear2D, + PatchEmbedding2D, + VanillaClassifier, + VanillaPatchEmbedding, + VocabParallelClassifier2D, + VocabParallelCrossEntropyLoss2D, + VocabParallelEmbedding2D, +) from colossalai.utils import get_current_device, print_rank_0 -from .common import (BATCH_SIZE, DEPTH, HIDDEN_SIZE, IMG_SIZE, NUM_CLASSES, SEQ_LENGTH, VOCAB_SIZE, check_equal) +from .common import BATCH_SIZE, DEPTH, HIDDEN_SIZE, IMG_SIZE, NUM_CLASSES, SEQ_LENGTH, VOCAB_SIZE, check_equal def check_linear(): @@ -336,7 +347,7 @@ def check_classifier_no_given_weight(): layer.weight.data.copy_(W) # W.requires_grad = True - B_shape = (OUTPUT_SIZE, ) + B_shape = (OUTPUT_SIZE,) B_master = torch.randint(5, B_shape, dtype=dtype, device=device) torch.distributed.broadcast(B_master, 
src=0) # B = torch.chunk(B_master, DEPTH, dim=0)[j] @@ -572,7 +583,7 @@ def check_loss(): out_shape = (BATCH_SIZE, NUM_CLASSES) out_master = torch.randn(out_shape, dtype=dtype, device=device) - target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE, ), dtype=torch.long, device=device) + target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE,), dtype=torch.long, device=device) torch.distributed.broadcast(out_master, src=0) torch.distributed.broadcast(target_master, src=0) out = torch.chunk(out_master, DEPTH, dim=0)[i] @@ -607,7 +618,7 @@ def check_vocab_parallel_loss(): out_shape = (BATCH_SIZE, NUM_CLASSES) out_master = torch.randn(out_shape, dtype=dtype, device=device) - target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE, ), dtype=torch.long, device=device) + target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE,), dtype=torch.long, device=device) torch.distributed.broadcast(out_master, src=0) torch.distributed.broadcast(target_master, src=0) out = torch.chunk(out_master, DEPTH, dim=0)[i] diff --git a/tests/test_layers/test_2d/checks_2d/check_operation_2d.py b/tests/test_legacy/test_layers/test_2d/checks_2d/check_operation_2d.py similarity index 96% rename from tests/test_layers/test_2d/checks_2d/check_operation_2d.py rename to tests/test_legacy/test_layers/test_2d/checks_2d/check_operation_2d.py index a5e37b1ec309..ae1d1120cfb9 100644 --- a/tests/test_layers/test_2d/checks_2d/check_operation_2d.py +++ b/tests/test_legacy/test_layers/test_2d/checks_2d/check_operation_2d.py @@ -5,10 +5,10 @@ from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc -from colossalai.nn.layer.parallel_2d._operation import Matmul_AB_2D, Matmul_ABT_2D, Matmul_ATB_2D -from colossalai.utils import get_current_device -from colossalai.utils import print_rank_0 -from .common import check_equal, BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE, DEPTH +from colossalai.legacy.nn.layer.parallel_2d._operation import Matmul_AB_2D, Matmul_ABT_2D, Matmul_ATB_2D +from colossalai.utils import get_current_device, print_rank_0 + +from .common import BATCH_SIZE, DEPTH, HIDDEN_SIZE, SEQ_LENGTH, check_equal def check_AB(): diff --git a/tests/test_layers/test_2d/checks_2d/common.py b/tests/test_legacy/test_layers/test_2d/checks_2d/common.py similarity index 100% rename from tests/test_layers/test_2d/checks_2d/common.py rename to tests/test_legacy/test_layers/test_2d/checks_2d/common.py diff --git a/tests/test_layers/test_2d/test_2d.py b/tests/test_legacy/test_layers/test_2d/test_2d.py similarity index 100% rename from tests/test_layers/test_2d/test_2d.py rename to tests/test_legacy/test_layers/test_2d/test_2d.py diff --git a/tests/test_layers/test_2p5d/checks_2p5d/__init__.py b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/__init__.py similarity index 100% rename from tests/test_layers/test_2p5d/checks_2p5d/__init__.py rename to tests/test_legacy/test_layers/test_2p5d/checks_2p5d/__init__.py diff --git a/tests/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py similarity index 98% rename from tests/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py rename to tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py index a8f551093b1e..5a99b05cfe7e 100644 --- a/tests/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py +++ b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py @@ -1,11 +1,22 @@ import torch +from torch.nn import Parameter + from 
colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc -from colossalai.nn import (Classifier2p5D, CrossEntropyLoss2p5D, Embedding2p5D, LayerNorm2p5D, Linear2p5D, - PatchEmbedding2p5D, VanillaClassifier, VanillaPatchEmbedding, VocabParallelClassifier2p5D, - VocabParallelCrossEntropyLoss2p5D, VocabParallelEmbedding2p5D) +from colossalai.legacy.nn import ( + Classifier2p5D, + CrossEntropyLoss2p5D, + Embedding2p5D, + LayerNorm2p5D, + Linear2p5D, + PatchEmbedding2p5D, + VanillaClassifier, + VanillaPatchEmbedding, + VocabParallelClassifier2p5D, + VocabParallelCrossEntropyLoss2p5D, + VocabParallelEmbedding2p5D, +) from colossalai.utils import get_current_device, print_rank_0 -from torch.nn import Parameter from .common import * @@ -342,7 +353,7 @@ def check_classifier_no_given_weight(): layer.weight.data.copy_(W) # W.requires_grad = True - B_shape = (OUTPUT_SIZE, ) + B_shape = (OUTPUT_SIZE,) B_master = torch.randint(5, B_shape, dtype=dtype, device=device) torch.distributed.broadcast(B_master, src=0) # B = torch.chunk(B_master, TESSERACT_DIM, dim=0)[j] @@ -577,7 +588,7 @@ def check_loss(): out_shape = (BATCH_SIZE, NUM_CLASSES) out_master = torch.randn(out_shape, dtype=dtype, device=device) - target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE, ), dtype=torch.long, device=device) + target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE,), dtype=torch.long, device=device) torch.distributed.broadcast(out_master, src=0) torch.distributed.broadcast(target_master, src=0) out = torch.chunk(out_master, TESSERACT_DIM, dim=0)[i] @@ -612,7 +623,7 @@ def check_vocab_parallel_loss(): out_shape = (BATCH_SIZE, NUM_CLASSES) out_master = torch.randn(out_shape, dtype=dtype, device=device) - target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE, ), dtype=torch.long, device=device) + target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE,), dtype=torch.long, device=device) torch.distributed.broadcast(out_master, src=0) torch.distributed.broadcast(target_master, src=0) out = torch.chunk(out_master, TESSERACT_DIM, dim=0)[i] diff --git a/tests/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py similarity index 97% rename from tests/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py rename to tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py index d0c3b02fccba..db19967676d2 100644 --- a/tests/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py +++ b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py @@ -2,10 +2,9 @@ from colossalai.context import ParallelMode from colossalai.core import global_context as gpc -from colossalai.nn.layer.parallel_2p5d._operation import Matmul_AB_2p5D, Matmul_ABT_2p5D, \ - Matmul_ATB_2p5D -from colossalai.utils import get_current_device -from colossalai.utils import print_rank_0 +from colossalai.legacy.nn.layer.parallel_2p5d._operation import Matmul_AB_2p5D, Matmul_ABT_2p5D, Matmul_ATB_2p5D +from colossalai.utils import get_current_device, print_rank_0 + from .common import * diff --git a/tests/test_layers/test_2p5d/checks_2p5d/common.py b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/common.py similarity index 75% rename from tests/test_layers/test_2p5d/checks_2p5d/common.py rename to tests/test_legacy/test_layers/test_2p5d/checks_2p5d/common.py index aff85f109666..c90d8fc086bd 100644 --- a/tests/test_layers/test_2p5d/checks_2p5d/common.py +++ 
b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/common.py @@ -11,4 +11,4 @@ def check_equal(A, B): - assert torch.allclose(A, B, rtol=1e-5, atol=1e-2) \ No newline at end of file + assert torch.allclose(A, B, rtol=1e-5, atol=1e-2) diff --git a/tests/test_layers/test_2p5d/test_2p5d.py b/tests/test_legacy/test_layers/test_2p5d/test_2p5d.py similarity index 100% rename from tests/test_layers/test_2p5d/test_2p5d.py rename to tests/test_legacy/test_layers/test_2p5d/test_2p5d.py diff --git a/tests/test_layers/test_3d/checks_3d/__init__.py b/tests/test_legacy/test_layers/test_3d/checks_3d/__init__.py similarity index 100% rename from tests/test_layers/test_3d/checks_3d/__init__.py rename to tests/test_legacy/test_layers/test_3d/checks_3d/__init__.py diff --git a/tests/test_layers/test_3d/checks_3d/check_layer_3d.py b/tests/test_legacy/test_layers/test_3d/checks_3d/check_layer_3d.py similarity index 99% rename from tests/test_layers/test_3d/checks_3d/check_layer_3d.py rename to tests/test_legacy/test_layers/test_3d/checks_3d/check_layer_3d.py index e946a1f5912d..cee639a9f00a 100644 --- a/tests/test_layers/test_3d/checks_3d/check_layer_3d.py +++ b/tests/test_legacy/test_layers/test_3d/checks_3d/check_layer_3d.py @@ -7,8 +7,7 @@ from colossalai.constants import INPUT_GROUP_3D, OUTPUT_GROUP_3D, WEIGHT_GROUP_3D from colossalai.core import global_context -from colossalai.logging import get_dist_logger -from colossalai.nn import ( +from colossalai.legacy.nn import ( Classifier3D, CrossEntropyLoss3D, Embedding3D, @@ -21,7 +20,8 @@ VocabParallelCrossEntropyLoss3D, VocabParallelEmbedding3D, ) -from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env +from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env +from colossalai.logging import get_dist_logger from colossalai.utils import get_current_device, print_rank_0 from .common import BATCH_SIZE, DEPTH, HIDDEN_SIZE, IMG_SIZE, NUM_CLASSES, SEQ_LENGTH, VOCAB_SIZE, check_equal diff --git a/tests/test_layers/test_3d/checks_3d/common.py b/tests/test_legacy/test_layers/test_3d/checks_3d/common.py similarity index 95% rename from tests/test_layers/test_3d/checks_3d/common.py rename to tests/test_legacy/test_layers/test_3d/checks_3d/common.py index afb19c4745cc..509fc2cecf59 100644 --- a/tests/test_layers/test_3d/checks_3d/common.py +++ b/tests/test_legacy/test_layers/test_3d/checks_3d/common.py @@ -16,4 +16,4 @@ def check_equal(A, B): eq = torch.allclose(A, B, rtol=1e-3, atol=1e-2) assert eq, f"\nA = {A}\nB = {B}" - return eq \ No newline at end of file + return eq diff --git a/tests/test_layers/test_3d/test_3d.py b/tests/test_legacy/test_layers/test_3d/test_3d.py similarity index 100% rename from tests/test_layers/test_3d/test_3d.py rename to tests/test_legacy/test_layers/test_3d/test_3d.py diff --git a/tests/test_layers/test_cache_embedding.py b/tests/test_legacy/test_layers/test_cache_embedding.py similarity index 99% rename from tests/test_layers/test_cache_embedding.py rename to tests/test_legacy/test_layers/test_cache_embedding.py index 22d4f02a48d7..0760a3f1ec38 100644 --- a/tests/test_layers/test_cache_embedding.py +++ b/tests/test_legacy/test_layers/test_cache_embedding.py @@ -6,7 +6,7 @@ import torch import colossalai -from colossalai.nn.parallel.layers import ( +from colossalai.legacy.nn.parallel.layers import ( CachedEmbeddingBag, CachedParamMgr, EvictionStrategy, diff --git a/tests/test_layers/test_sequence/checks_seq/__init__.py b/tests/test_legacy/test_layers/test_sequence/checks_seq/__init__.py 
similarity index 100% rename from tests/test_layers/test_sequence/checks_seq/__init__.py rename to tests/test_legacy/test_layers/test_sequence/checks_seq/__init__.py diff --git a/tests/test_layers/test_sequence/checks_seq/check_layer_seq.py b/tests/test_legacy/test_layers/test_sequence/checks_seq/check_layer_seq.py similarity index 91% rename from tests/test_layers/test_sequence/checks_seq/check_layer_seq.py rename to tests/test_legacy/test_layers/test_sequence/checks_seq/check_layer_seq.py index 2b7b999d4373..7ff91a7b76e0 100644 --- a/tests/test_layers/test_sequence/checks_seq/check_layer_seq.py +++ b/tests/test_legacy/test_layers/test_sequence/checks_seq/check_layer_seq.py @@ -2,7 +2,7 @@ from colossalai.context import ParallelMode from colossalai.core import global_context as gpc -from colossalai.nn import TransformerSelfAttentionRing +from colossalai.legacy.nn import TransformerSelfAttentionRing from colossalai.utils import get_current_device diff --git a/tests/test_layers/test_sequence/test_sequence.py b/tests/test_legacy/test_layers/test_sequence/test_sequence.py similarity index 97% rename from tests/test_layers/test_sequence/test_sequence.py rename to tests/test_legacy/test_layers/test_sequence/test_sequence.py index 60f2d55f43af..b9e6c12479ee 100644 --- a/tests/test_layers/test_sequence/test_sequence.py +++ b/tests/test_legacy/test_layers/test_sequence/test_sequence.py @@ -5,6 +5,7 @@ import colossalai from colossalai.context import ParallelMode from colossalai.core import global_context as gpc +from colossalai.legacy.nn.layer.parallel_sequence import RingAV, RingQK from colossalai.testing import rerun_if_address_is_in_use, spawn CONFIG = dict(parallel=dict(tensor=dict(size=4, mode='sequence'))) @@ -42,7 +43,7 @@ def check_ring_qk(rank, world_size): a = torch.matmul(q, k.transpose(2, 1)) # compute distributed attention scores - ring_qk = colossalai.nn.layer.parallel_sequence.RingQK.apply + ring_qk = RingQK.apply sub_a = ring_qk(sub_q, sub_k, batch_size, num_heads, sub_seq_length) # check master and distributed attention scores @@ -95,7 +96,7 @@ def check_ring_av(rank, world_size): out = torch.matmul(a, v) # compute distributed attention scores - ring_av = colossalai.nn.layer.parallel_sequence.RingAV.apply + ring_av = RingAV.apply sub_out = ring_av(sub_a, sub_v, batch_size, num_heads, attention_head_size, sub_seq_length) # print(f'master output shape: {out.shape}, partial output shape: {sub_out.shape}') diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_1d.py b/tests/test_utils/test_checkpoint/test_checkpoint_1d.py index 335be61359ed..9c3a7e2161d2 100644 --- a/tests/test_utils/test_checkpoint/test_checkpoint_1d.py +++ b/tests/test_utils/test_checkpoint/test_checkpoint_1d.py @@ -7,7 +7,7 @@ import torch import torch.nn as nn -import colossalai.nn as col_nn +import colossalai.legacy.nn as col_nn from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc from colossalai.initialize import launch diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_2d.py b/tests/test_utils/test_checkpoint/test_checkpoint_2d.py index 175d9ef6ceb9..03b2e4f2a9b2 100644 --- a/tests/test_utils/test_checkpoint/test_checkpoint_2d.py +++ b/tests/test_utils/test_checkpoint/test_checkpoint_2d.py @@ -7,7 +7,7 @@ import torch import torch.nn as nn -import colossalai.nn as col_nn +import colossalai.legacy.nn as col_nn from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc from 
colossalai.initialize import launch diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py b/tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py index 33cb3a65d184..cafffd0a6202 100644 --- a/tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py +++ b/tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py @@ -7,7 +7,7 @@ import torch import torch.nn as nn -import colossalai.nn as col_nn +import colossalai.legacy.nn as col_nn from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc from colossalai.initialize import launch diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_3d.py b/tests/test_utils/test_checkpoint/test_checkpoint_3d.py index 73ac2dd5fe18..9b43be9e8cc5 100644 --- a/tests/test_utils/test_checkpoint/test_checkpoint_3d.py +++ b/tests/test_utils/test_checkpoint/test_checkpoint_3d.py @@ -7,7 +7,7 @@ import torch import torch.nn as nn -import colossalai.nn as col_nn +import colossalai.legacy.nn as col_nn from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc from colossalai.initialize import launch
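Note on usage after this patch: the test hunks above show that the tensor-parallel layers, losses, and sequence-parallel ops now resolve under `colossalai.legacy.nn` instead of `colossalai.nn`. A minimal sketch of the updated import style is below; the symbol names are taken directly from the hunks above, while any surrounding test setup is assumed and omitted.

```python
# Illustrative only: import paths as they stand after this patch.
import colossalai.legacy.nn as col_nn  # previously: import colossalai.nn as col_nn

# Tensor-parallel layers and losses used by the relocated layer tests.
from colossalai.legacy.nn import Classifier1D, Embedding1D, Linear1D_Col
from colossalai.legacy.nn import CrossEntropyLoss2D, LayerNorm2D, Linear2D

# Sequence-parallel ring-attention ops are now imported directly
# rather than referenced via the full dotted colossalai.nn.layer path.
from colossalai.legacy.nn.layer.parallel_sequence import RingAV, RingQK
```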