From 89fe0277875146cc521f1e15e508efd43e56f34c Mon Sep 17 00:00:00 2001 From: Hongxin Liu Date: Thu, 31 Aug 2023 13:51:28 +0800 Subject: [PATCH] [legacy] move trainer to legacy (#4545) * [legacy] move trainer to legacy * [doc] update docs related to trainer * [test] ignore legacy test --- colossalai/legacy/__init__.py | 0 colossalai/{ => legacy}/trainer/__init__.py | 0 colossalai/{ => legacy}/trainer/_trainer.py | 7 +- .../{ => legacy}/trainer/hooks/__init__.py | 9 +- .../{ => legacy}/trainer/hooks/_base_hook.py | 0 .../trainer/hooks/_checkpoint_hook.py | 5 +- .../{ => legacy}/trainer/hooks/_commons_.py | 0 .../{ => legacy}/trainer/hooks/_log_hook.py | 10 +- .../trainer/hooks/_lr_scheduler_hook.py | 3 +- .../trainer/hooks/_metric_hook.py | 11 +- .../train_gpt_using_hybrid_parallelism.md | 3 +- .../train_vit_using_pipeline_parallelism.md | 3 +- .../train_vit_with_hybrid_parallelism.md | 3 +- docs/source/en/basics/engine_trainer.md | 7 +- docs/source/en/basics/model_checkpoint.md | 3 +- .../en/features/mixed_precision_training.md | 2 +- docs/source/en/features/pipeline_parallel.md | 3 +- .../train_gpt_using_hybrid_parallelism.md | 3 +- .../train_vit_using_pipeline_parallelism.md | 3 +- .../train_vit_with_hybrid_parallelism.md | 3 +- docs/source/zh-Hans/basics/engine_trainer.md | 7 +- .../source/zh-Hans/basics/model_checkpoint.md | 3 +- .../features/mixed_precision_training.md | 2 +- .../zh-Hans/features/pipeline_parallel.md | 3 +- examples/language/gpt/titans/train_gpt.py | 2 +- pytest.ini | 2 +- .../test_cifar_with_data_pipeline_tensor.py | 100 ------------------ .../test_trainer/test_pipeline/test_p2p.py | 0 .../test_pipeline/test_pipeline_schedule.py | 0 .../test_trainer_with_non_pipe_schedule.py | 2 +- .../test_trainer_with_pipe_schedule.py | 2 +- .../test_cuda_rpc_performance.py | 15 +-- 32 files changed, 63 insertions(+), 153 deletions(-) create mode 100644 colossalai/legacy/__init__.py rename colossalai/{ => legacy}/trainer/__init__.py (100%) rename colossalai/{ => legacy}/trainer/_trainer.py (98%) rename colossalai/{ => legacy}/trainer/hooks/__init__.py (75%) rename colossalai/{ => legacy}/trainer/hooks/_base_hook.py (100%) rename colossalai/{ => legacy}/trainer/hooks/_checkpoint_hook.py (98%) rename colossalai/{ => legacy}/trainer/hooks/_commons_.py (100%) rename colossalai/{ => legacy}/trainer/hooks/_log_hook.py (98%) rename colossalai/{ => legacy}/trainer/hooks/_lr_scheduler_hook.py (99%) rename colossalai/{ => legacy}/trainer/hooks/_metric_hook.py (98%) delete mode 100644 tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py rename tests/{ => test_legacy}/test_trainer/test_pipeline/test_p2p.py (100%) rename tests/{ => test_legacy}/test_trainer/test_pipeline/test_pipeline_schedule.py (100%) rename tests/{ => test_legacy}/test_trainer/test_trainer_with_non_pipe_schedule.py (97%) rename tests/{ => test_legacy}/test_trainer/test_trainer_with_pipe_schedule.py (98%) diff --git a/colossalai/legacy/__init__.py b/colossalai/legacy/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/colossalai/trainer/__init__.py b/colossalai/legacy/trainer/__init__.py similarity index 100% rename from colossalai/trainer/__init__.py rename to colossalai/legacy/trainer/__init__.py diff --git a/colossalai/trainer/_trainer.py b/colossalai/legacy/trainer/_trainer.py similarity index 98% rename from colossalai/trainer/_trainer.py rename to colossalai/legacy/trainer/_trainer.py index bfe1c403fd48..fb66acec5f25 100644 --- a/colossalai/trainer/_trainer.py +++ 
b/colossalai/legacy/trainer/_trainer.py @@ -1,14 +1,13 @@ -from typing import Union, List, Any +from typing import Any, List, Union import torch from torch.utils.data import DataLoader from tqdm import tqdm from colossalai.engine import Engine +from colossalai.legacy.trainer.hooks import BaseHook from colossalai.logging import DistributedLogger -from colossalai.utils import MultiTimer -from colossalai.utils import is_dp_rank_0, is_tp_rank_0, is_no_pp_or_last_stage -from colossalai.trainer.hooks import BaseHook +from colossalai.utils import MultiTimer, is_dp_rank_0, is_no_pp_or_last_stage, is_tp_rank_0 class Trainer: diff --git a/colossalai/trainer/hooks/__init__.py b/colossalai/legacy/trainer/hooks/__init__.py similarity index 75% rename from colossalai/trainer/hooks/__init__.py rename to colossalai/legacy/trainer/hooks/__init__.py index 4d36093833d9..bf9cc6421b67 100644 --- a/colossalai/trainer/hooks/__init__.py +++ b/colossalai/legacy/trainer/hooks/__init__.py @@ -1,7 +1,12 @@ from ._base_hook import BaseHook from ._checkpoint_hook import SaveCheckpointHook -from ._log_hook import (LogMemoryByEpochHook, LogMetricByEpochHook, LogMetricByStepHook, LogTimingByEpochHook, - TensorboardHook) +from ._log_hook import ( + LogMemoryByEpochHook, + LogMetricByEpochHook, + LogMetricByStepHook, + LogTimingByEpochHook, + TensorboardHook, +) from ._lr_scheduler_hook import LRSchedulerHook from ._metric_hook import AccuracyHook, LossHook, MetricHook, ThroughputHook diff --git a/colossalai/trainer/hooks/_base_hook.py b/colossalai/legacy/trainer/hooks/_base_hook.py similarity index 100% rename from colossalai/trainer/hooks/_base_hook.py rename to colossalai/legacy/trainer/hooks/_base_hook.py diff --git a/colossalai/trainer/hooks/_checkpoint_hook.py b/colossalai/legacy/trainer/hooks/_checkpoint_hook.py similarity index 98% rename from colossalai/trainer/hooks/_checkpoint_hook.py rename to colossalai/legacy/trainer/hooks/_checkpoint_hook.py index 3bcb32cd2dcb..7754ebcc3bcc 100644 --- a/colossalai/trainer/hooks/_checkpoint_hook.py +++ b/colossalai/legacy/trainer/hooks/_checkpoint_hook.py @@ -1,11 +1,12 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- import torch -from colossalai.logging import get_dist_logger +from colossalai.legacy.trainer.hooks import BaseHook +from colossalai.logging import get_dist_logger from colossalai.registry import HOOKS -from colossalai.trainer.hooks import BaseHook from colossalai.utils.checkpointing import save_checkpoint + from ._lr_scheduler_hook import LRSchedulerHook diff --git a/colossalai/trainer/hooks/_commons_.py b/colossalai/legacy/trainer/hooks/_commons_.py similarity index 100% rename from colossalai/trainer/hooks/_commons_.py rename to colossalai/legacy/trainer/hooks/_commons_.py diff --git a/colossalai/trainer/hooks/_log_hook.py b/colossalai/legacy/trainer/hooks/_log_hook.py similarity index 98% rename from colossalai/trainer/hooks/_log_hook.py rename to colossalai/legacy/trainer/hooks/_log_hook.py index 5b1f33983422..1efc8be7644f 100644 --- a/colossalai/trainer/hooks/_log_hook.py +++ b/colossalai/legacy/trainer/hooks/_log_hook.py @@ -3,17 +3,17 @@ import os import os.path as osp - from typing import List + from colossalai.context import ParallelMode from colossalai.core import global_context as gpc -from colossalai.registry import HOOKS +from colossalai.legacy.trainer.hooks._metric_hook import ThroughputMetric from colossalai.logging import DistributedLogger -from colossalai.utils import report_memory_usage, is_dp_rank_0, \ - is_tp_rank_0, is_no_pp_or_last_stage, 
MultiTimer +from colossalai.registry import HOOKS +from colossalai.utils import MultiTimer, is_dp_rank_0, is_no_pp_or_last_stage, is_tp_rank_0, report_memory_usage + from ._base_hook import BaseHook from ._commons_ import _format_number -from colossalai.trainer.hooks._metric_hook import ThroughputMetric class LogByEpochHook(BaseHook): diff --git a/colossalai/trainer/hooks/_lr_scheduler_hook.py b/colossalai/legacy/trainer/hooks/_lr_scheduler_hook.py similarity index 99% rename from colossalai/trainer/hooks/_lr_scheduler_hook.py rename to colossalai/legacy/trainer/hooks/_lr_scheduler_hook.py index c6da33442dc3..0d19ab08a822 100644 --- a/colossalai/trainer/hooks/_lr_scheduler_hook.py +++ b/colossalai/legacy/trainer/hooks/_lr_scheduler_hook.py @@ -1,6 +1,7 @@ -from colossalai.registry import HOOKS from torch import Tensor +from colossalai.registry import HOOKS + from ._metric_hook import LearningRateMetric, MetricHook diff --git a/colossalai/trainer/hooks/_metric_hook.py b/colossalai/legacy/trainer/hooks/_metric_hook.py similarity index 98% rename from colossalai/trainer/hooks/_metric_hook.py rename to colossalai/legacy/trainer/hooks/_metric_hook.py index 526d6c746ec6..96def4172fed 100644 --- a/colossalai/trainer/hooks/_metric_hook.py +++ b/colossalai/legacy/trainer/hooks/_metric_hook.py @@ -6,6 +6,7 @@ import torch import torch.distributed as dist + from colossalai.communication import all_reduce from colossalai.context import ParallelMode from colossalai.core import global_context as gpc @@ -19,8 +20,8 @@ class Metric(ABC): """A basic class of metric collectors. It collects a specific metric during training or evaluation and would always be used with - :class:`MetricHook` to help it update its states and show the - metric. So please use corresponding hook class to make the metric + :class:`MetricHook` to help it update its states and show the + metric. So please use corresponding hook class to make the metric collector works. Args: @@ -220,9 +221,9 @@ def is_better(a, b) -> bool: class MetricHook(BaseHook): - """Specialized hook classes for :class:`Metric`. - Some help metric collectors initialize, reset and - update their states. Others are used to display and + """Specialized hook classes for :class:`Metric`. + Some help metric collectors initialize, reset and + update their states. Others are used to display and record the metric. 
Args: diff --git a/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md b/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md index 715c15eb6300..24aa2610faea 100644 --- a/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md +++ b/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md @@ -43,7 +43,7 @@ from colossalai.engine.schedule import (InterleavedPipelineSchedule, PipelineSchedule) from colossalai.logging import disable_existing_loggers, get_dist_logger from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper -from colossalai.trainer import Trainer, hooks +from colossalai.legacy.trainer import Trainer, hooks from colossalai.utils.timer import MultiTimer from model_zoo.gpt import GPTLMLoss from torch.nn import functional as F @@ -268,3 +268,4 @@ def train(): return_output_label=False, ) ``` + diff --git a/docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md b/docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md index 6adfe4f113da..3475d8f070f5 100644 --- a/docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md +++ b/docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md @@ -38,7 +38,7 @@ from colossalai.builder import build_pipeline_model from colossalai.engine.schedule import (InterleavedPipelineSchedule, PipelineSchedule) from colossalai.logging import disable_existing_loggers, get_dist_logger -from colossalai.trainer import Trainer, hooks +from colossalai.legacy.trainer import Trainer, hooks from colossalai.utils import MultiTimer, get_dataloader from timm.models import vision_transformer as vit from torchvision import transforms @@ -245,3 +245,4 @@ def train(): hooks=hook_list, display_progress=True) ``` + diff --git a/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md index a2deaeb88893..5b0b694b3153 100644 --- a/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md +++ b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md @@ -79,7 +79,7 @@ from colossalai.core import global_context as gpc from colossalai.logging import disable_existing_loggers, get_dist_logger from colossalai.nn.lr_scheduler import LinearWarmupLR from colossalai.nn.metric import Accuracy -from colossalai.trainer import Trainer, hooks +from colossalai.legacy.trainer import Trainer, hooks ``` - Other modules @@ -644,3 +644,4 @@ torchrun --standalone --nproc_per_node train_hybrid.py --config ./co # If your torch >= 1.9.0 # python -m torch.distributed.run --standalone --nproc_per_node= train_hybrid.py --config ./configs/config_hybrid_parallel.py ``` + diff --git a/docs/source/en/basics/engine_trainer.md b/docs/source/en/basics/engine_trainer.md index d2f99563f042..6d2355ad9044 100644 --- a/docs/source/en/basics/engine_trainer.md +++ b/docs/source/en/basics/engine_trainer.md @@ -64,7 +64,7 @@ Trainer is a more high-level wrapper for the user to execute training with fewer ```python from colossalai.logging import get_dist_logger -from colossalai.trainer import Trainer, hooks +from colossalai.legacy.trainer import Trainer, hooks # build components and initialize with colossalai.initialize ... 
@@ -107,7 +107,7 @@ If you want to customize your own hook class, you can inherit `hooks.BaseHook` a ```python from colossalai.logging import get_dist_logger -from colossalai.trainer import hooks +from colossalai.legacy.trainer import hooks class LogMessageHook(hooks.BaseHook): @@ -345,7 +345,7 @@ If you wish to train with a trainer object, you can follow the code snippet belo ```python from colossalai.nn.metric import Accuracy -from colossalai.trainer import Trainer, hooks +from colossalai.legacy.trainer import Trainer, hooks # create a trainer object @@ -387,3 +387,4 @@ python -m torch.distributed.launch --nproc_per_node --master_addr loc # with trainer python -m torch.distributed.launch --nproc_per_node --master_addr localhost --master_port 29500 run_resnet_cifar10_with_trainer.py ``` + diff --git a/docs/source/en/basics/model_checkpoint.md b/docs/source/en/basics/model_checkpoint.md index 70334f1c41e7..c3ba5b04bca2 100644 --- a/docs/source/en/basics/model_checkpoint.md +++ b/docs/source/en/basics/model_checkpoint.md @@ -41,7 +41,7 @@ for epoch in range(num_epochs): #### Save when using trainer ```python -from colossalai.trainer import Trainer, hooks +from colossalai.legacy.trainer import Trainer, hooks model = ... engine, _, _, _ = colossalai.initialize(model=model, ...) trainer = Trainer(engine, ...) @@ -61,3 +61,4 @@ model = ... load_checkpoint('xxx.pt', model) ... # train or test ``` + diff --git a/docs/source/en/features/mixed_precision_training.md b/docs/source/en/features/mixed_precision_training.md index 8579d586ed5f..164b2a21598c 100644 --- a/docs/source/en/features/mixed_precision_training.md +++ b/docs/source/en/features/mixed_precision_training.md @@ -267,7 +267,7 @@ from pathlib import Path from colossalai.core import global_context as gpc from colossalai.logging import get_dist_logger from colossalai.utils import get_dataloader -from colossalai.trainer import Trainer, hooks +from colossalai.legacy.trainer import Trainer, hooks from colossalai.nn.lr_scheduler import LinearWarmupLR from timm.models import vit_base_patch16_224 from torchvision import datasets, transforms diff --git a/docs/source/en/features/pipeline_parallel.md b/docs/source/en/features/pipeline_parallel.md index 30654b0b0195..8b5f228a9e5e 100644 --- a/docs/source/en/features/pipeline_parallel.md +++ b/docs/source/en/features/pipeline_parallel.md @@ -79,7 +79,7 @@ import colossalai.nn as col_nn from colossalai.core import global_context as gpc from colossalai.logging import disable_existing_loggers, get_dist_logger -from colossalai.trainer import Trainer, hooks +from colossalai.legacy.trainer import Trainer, hooks from colossalai.utils import MultiTimer, get_dataloader from colossalai.context import ParallelMode from colossalai.pipeline.pipelinable import PipelinableContext @@ -157,3 +157,4 @@ trainer.fit(train_dataloader=train_dataloader, ``` We use `2` pipeline stages and the batch will be split into `4` micro batches. 
+ diff --git a/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md b/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md index 6c6dcf6e850d..a199d31e7242 100644 --- a/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md +++ b/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md @@ -43,7 +43,7 @@ from colossalai.engine.schedule import (InterleavedPipelineSchedule, PipelineSchedule) from colossalai.logging import disable_existing_loggers, get_dist_logger from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper -from colossalai.trainer import Trainer, hooks +from colossalai.legacy.trainer import Trainer, hooks from colossalai.utils.timer import MultiTimer from model_zoo.gpt import GPTLMLoss from torch.nn import functional as F @@ -273,3 +273,4 @@ def train(): return_output_label=False, ) ``` + diff --git a/docs/source/zh-Hans/advanced_tutorials/train_vit_using_pipeline_parallelism.md b/docs/source/zh-Hans/advanced_tutorials/train_vit_using_pipeline_parallelism.md index 495c7fa36cc1..d3a98c89b48e 100644 --- a/docs/source/zh-Hans/advanced_tutorials/train_vit_using_pipeline_parallelism.md +++ b/docs/source/zh-Hans/advanced_tutorials/train_vit_using_pipeline_parallelism.md @@ -36,7 +36,7 @@ from colossalai.builder import build_pipeline_model from colossalai.engine.schedule import (InterleavedPipelineSchedule, PipelineSchedule) from colossalai.logging import disable_existing_loggers, get_dist_logger -from colossalai.trainer import Trainer, hooks +from colossalai.legacy.trainer import Trainer, hooks from colossalai.utils import MultiTimer, get_dataloader from timm.models import vision_transformer as vit from torchvision import transforms @@ -244,3 +244,4 @@ def train(): hooks=hook_list, display_progress=True) ``` + diff --git a/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md index 5ad08392049e..ddc2502f05da 100644 --- a/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md +++ b/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md @@ -74,7 +74,7 @@ from colossalai.core import global_context as gpc from colossalai.logging import disable_existing_loggers, get_dist_logger from colossalai.nn.lr_scheduler import LinearWarmupLR from colossalai.nn.metric import Accuracy -from colossalai.trainer import Trainer, hooks +from colossalai.legacy.trainer import Trainer, hooks ``` - 其他模块 @@ -589,3 +589,4 @@ torchrun --standalone --nproc_per_node train_hybrid.py --config ./co # If your torch >= 1.9.0 # python -m torch.distributed.run --standalone --nproc_per_node= train_hybrid.py --config ./configs/config_hybrid_parallel.py ``` + diff --git a/docs/source/zh-Hans/basics/engine_trainer.md b/docs/source/zh-Hans/basics/engine_trainer.md index a35bd87c44e1..e57220292c98 100644 --- a/docs/source/zh-Hans/basics/engine_trainer.md +++ b/docs/source/zh-Hans/basics/engine_trainer.md @@ -61,7 +61,7 @@ Trainer 的参数 `schedule` 默认值是 `None` 。在大多数情况下,除 ```python from colossalai.logging import get_dist_logger -from colossalai.trainer import Trainer, hooks +from colossalai.legacy.trainer import Trainer, hooks # build components and initialize with colossalai.initialize ... 
@@ -104,7 +104,7 @@ trainer.fit( ```python from colossalai.logging import get_dist_logger -from colossalai.trainer import hooks +from colossalai.legacy.trainer import hooks class LogMessageHook(hooks.BaseHook): @@ -341,7 +341,7 @@ for epoch in range(gpc.config.NUM_EPOCHS): ```python from colossalai.nn.metric import Accuracy -from colossalai.trainer import Trainer, hooks +from colossalai.legacy.trainer import Trainer, hooks # create a trainer object @@ -384,3 +384,4 @@ python -m torch.distributed.launch --nproc_per_node --master_addr loc # with trainer python -m torch.distributed.launch --nproc_per_node --master_addr localhost --master_port 29500 run_resnet_cifar10_with_trainer.py ``` + diff --git a/docs/source/zh-Hans/basics/model_checkpoint.md b/docs/source/zh-Hans/basics/model_checkpoint.md index a5374b7509c9..4a49d373a2a4 100644 --- a/docs/source/zh-Hans/basics/model_checkpoint.md +++ b/docs/source/zh-Hans/basics/model_checkpoint.md @@ -41,7 +41,7 @@ for epoch in range(num_epochs): #### 用 trainer 保存 ```python -from colossalai.trainer import Trainer, hooks +from colossalai.legacy.trainer import Trainer, hooks model = ... engine, _, _, _ = colossalai.initialize(model=model, ...) trainer = Trainer(engine, ...) @@ -61,3 +61,4 @@ model = ... load_checkpoint('xxx.pt', model) ... # train or test ``` + diff --git a/docs/source/zh-Hans/features/mixed_precision_training.md b/docs/source/zh-Hans/features/mixed_precision_training.md index a92e7e093015..35a73f1adbcd 100644 --- a/docs/source/zh-Hans/features/mixed_precision_training.md +++ b/docs/source/zh-Hans/features/mixed_precision_training.md @@ -245,7 +245,7 @@ from pathlib import Path from colossalai.core import global_context as gpc from colossalai.logging import get_dist_logger from colossalai.utils import get_dataloader -from colossalai.trainer import Trainer, hooks +from colossalai.legacy.trainer import Trainer, hooks from colossalai.nn.lr_scheduler import LinearWarmupLR from timm.models import vit_base_patch16_224 from torchvision import datasets, transforms diff --git a/docs/source/zh-Hans/features/pipeline_parallel.md b/docs/source/zh-Hans/features/pipeline_parallel.md index 98096b1d7f93..1497dc399f6c 100644 --- a/docs/source/zh-Hans/features/pipeline_parallel.md +++ b/docs/source/zh-Hans/features/pipeline_parallel.md @@ -78,7 +78,7 @@ import colossalai.nn as col_nn from colossalai.core import global_context as gpc from colossalai.logging import disable_existing_loggers, get_dist_logger -from colossalai.trainer import Trainer, hooks +from colossalai.legacy.trainer import Trainer, hooks from colossalai.utils import MultiTimer, get_dataloader from colossalai.context import ParallelMode from colossalai.pipeline.pipelinable import PipelinableContext @@ -156,3 +156,4 @@ trainer.fit(train_dataloader=train_dataloader, ``` 我们使用 `2` 个流水段,并且 batch 将被切分为 `4` 个 micro batches。 + diff --git a/examples/language/gpt/titans/train_gpt.py b/examples/language/gpt/titans/train_gpt.py index 6be0b9e8da30..b239b626c07f 100644 --- a/examples/language/gpt/titans/train_gpt.py +++ b/examples/language/gpt/titans/train_gpt.py @@ -10,9 +10,9 @@ import colossalai.utils as utils from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc +from colossalai.legacy.trainer import Trainer, hooks from colossalai.logging import disable_existing_loggers, get_dist_logger from colossalai.nn import LinearWarmupLR -from colossalai.trainer import Trainer, hooks from colossalai.utils import colo_set_process_memory_fraction, is_using_pp 
from colossalai.utils.timer import MultiTimer from colossalai.zero.legacy.init_ctx import ZeroInitContext diff --git a/pytest.ini b/pytest.ini index d25865d52ae9..b869bb4fa116 100644 --- a/pytest.ini +++ b/pytest.ini @@ -4,4 +4,4 @@ markers = gpu: tests which requires a single GPU dist: tests which are run in a multi-GPU or multi-machine environment experiment: tests for experimental features -addopts = --ignore=tests/test_analyzer --ignore=tests/test_auto_parallel --ignore=tests/test_autochunk --ignore=tests/test_moe --ignore=tests/test_fx +addopts = --ignore=tests/test_analyzer --ignore=tests/test_auto_parallel --ignore=tests/test_autochunk --ignore=tests/test_moe --ignore=tests/test_fx --ignore=tests/test_legacy diff --git a/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py b/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py deleted file mode 100644 index 4992acbd7cc2..000000000000 --- a/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py +++ /dev/null @@ -1,100 +0,0 @@ -import os -from pathlib import Path - -import pytest -import torch -from torchvision import transforms -from torchvision.datasets import CIFAR10 - -import colossalai -from colossalai.amp import AMP_TYPE -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.logging import get_dist_logger -from colossalai.nn import CrossEntropyLoss -from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR -from colossalai.pipeline.pipelinable import PipelinableContext -from colossalai.testing import rerun_if_address_is_in_use, skip_if_not_enough_gpus, spawn -from colossalai.trainer import Trainer, hooks -from colossalai.utils import get_dataloader - -BATCH_SIZE = 4 -NUM_EPOCHS = 60 -WARMUP_EPOCHS = 5 -CONFIG = dict(NUM_MICRO_BATCHES=2, - parallel=dict(pipeline=2, tensor=dict(size=2, mode='1d')), - fp16=dict(mode=AMP_TYPE.NAIVE), - gradient_accumulation=2) - - -def run_trainer(rank, world_size, port): - colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') - - logger = get_dist_logger() - - # get logger - logger = get_dist_logger() - - pipelinable = PipelinableContext() - try: - from titans.model.vit import vit_tiny_patch4_32 - except ImportError: - logger.warning('skip the test_cifar_with_data_pipeline_tensor test because titan is not installed') - logger.warning('please install titan from https://github.com/hpcaitech/Titans') - return - with pipelinable: - model = vit_tiny_patch4_32() - pipelinable.to_layer_list() - pipelinable.policy = "uniform" - model = pipelinable.partition(1, gpc.pipeline_parallel_size, gpc.get_local_rank(ParallelMode.PIPELINE)) - - # create dataloaders - root = Path(os.environ['DATA']) - transform_train = transforms.Compose([ - transforms.RandomCrop(32, padding=4, pad_if_needed=True), - transforms.AutoAugment(policy=transforms.AutoAugmentPolicy.CIFAR10), - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), - ]) - train_dataset = CIFAR10(root=root, train=True, download=True, transform=transform_train) - train_dataloader = get_dataloader(dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE, pin_memory=True) - - # create loss function - criterion = CrossEntropyLoss(label_smoothing=0.1) - - # create optimizer - optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0) - - # create lr scheduler - lr_scheduler = 
CosineAnnealingWarmupLR(optimizer=optimizer, total_steps=NUM_EPOCHS, warmup_steps=WARMUP_EPOCHS) - - # initialize - engine, train_dataloader, *_ = colossalai.initialize(model=model, - optimizer=optimizer, - criterion=criterion, - train_dataloader=train_dataloader) - - logger = get_dist_logger() - - trainer = Trainer(engine=engine, logger=logger) - - hook_list = [ - hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=False), - ] - - trainer.fit(train_dataloader=train_dataloader, - epochs=NUM_EPOCHS, - max_steps=2, - hooks=hook_list, - display_progress=True) - - -@pytest.mark.dist -@skip_if_not_enough_gpus(min_gpus=8) -@rerun_if_address_is_in_use() -def test_hybrid_parallel(): - spawn(run_trainer, 8) - - -if __name__ == '__main__': - test_hybrid_parallel() diff --git a/tests/test_trainer/test_pipeline/test_p2p.py b/tests/test_legacy/test_trainer/test_pipeline/test_p2p.py similarity index 100% rename from tests/test_trainer/test_pipeline/test_p2p.py rename to tests/test_legacy/test_trainer/test_pipeline/test_p2p.py diff --git a/tests/test_trainer/test_pipeline/test_pipeline_schedule.py b/tests/test_legacy/test_trainer/test_pipeline/test_pipeline_schedule.py similarity index 100% rename from tests/test_trainer/test_pipeline/test_pipeline_schedule.py rename to tests/test_legacy/test_trainer/test_pipeline/test_pipeline_schedule.py diff --git a/tests/test_trainer/test_trainer_with_non_pipe_schedule.py b/tests/test_legacy/test_trainer/test_trainer_with_non_pipe_schedule.py similarity index 97% rename from tests/test_trainer/test_trainer_with_non_pipe_schedule.py rename to tests/test_legacy/test_trainer/test_trainer_with_non_pipe_schedule.py index 753f82222f9d..dab0e53a4c32 100644 --- a/tests/test_trainer/test_trainer_with_non_pipe_schedule.py +++ b/tests/test_legacy/test_trainer/test_trainer_with_non_pipe_schedule.py @@ -3,9 +3,9 @@ import colossalai from colossalai.amp.amp_type import AMP_TYPE +from colossalai.legacy.trainer import Trainer from colossalai.logging import get_dist_logger from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn -from colossalai.trainer import Trainer from colossalai.utils import MultiTimer from tests.components_to_test.registry import non_distributed_component_funcs diff --git a/tests/test_trainer/test_trainer_with_pipe_schedule.py b/tests/test_legacy/test_trainer/test_trainer_with_pipe_schedule.py similarity index 98% rename from tests/test_trainer/test_trainer_with_pipe_schedule.py rename to tests/test_legacy/test_trainer/test_trainer_with_pipe_schedule.py index bb63d51a0b65..7dfbec854ccc 100644 --- a/tests/test_trainer/test_trainer_with_pipe_schedule.py +++ b/tests/test_legacy/test_trainer/test_trainer_with_pipe_schedule.py @@ -12,9 +12,9 @@ import colossalai from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc +from colossalai.legacy.trainer import Trainer from colossalai.logging import get_dist_logger from colossalai.testing import rerun_if_address_is_in_use, spawn -from colossalai.trainer import Trainer from colossalai.utils import MultiTimer, get_dataloader BATCH_SIZE = 4 diff --git a/tests/test_pipeline/test_cuda_rpc_performance.py b/tests/test_pipeline/test_cuda_rpc_performance.py index 6a0509555862..4bacb2181ef9 100644 --- a/tests/test_pipeline/test_cuda_rpc_performance.py +++ b/tests/test_pipeline/test_cuda_rpc_performance.py @@ -1,25 +1,16 @@ import os -from typing import Callable, List, Optional, Type, Union import time import pytest import torch import torch.nn as nn 
+from rpc_test_utils import parse_args, rpc_run from titans.dataloader.cifar10 import build_cifar from torchvision.models import resnet50 -from torchvision.models.resnet import BasicBlock, Bottleneck, conv1x1 from tqdm import tqdm -from rpc_test_utils import rpc_run, parse_args -import colossalai -import colossalai.nn as col_nn -from colossalai.logging import disable_existing_loggers, get_dist_logger -from colossalai.trainer import Trainer, hooks -from colossalai.utils import MultiTimer, get_dataloader -from colossalai.context import ParallelMode -from colossalai.pipeline.pipelinable import PipelinableContext, PipelinableModel -from colossalai.pipeline.rpc import OneFOneBPipelineEngine, ChimeraPipelineEngine -from colossalai.pipeline.pipeline_process_group import ppg +from colossalai.pipeline.pipelinable import PipelinableContext +from colossalai.pipeline.rpc import OneFOneBPipelineEngine def flatten(x):
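
For downstream code, the practical effect of this patch is an import-path change only: the trainer now lives under the `colossalai.legacy` namespace, while the `Trainer` class and the `hooks` module keep the same public names (as the updated docs above show). A minimal usage sketch, assuming an `engine` and `train_dataloader` have already been built via `colossalai.initialize` as in the docs; the hook choices here are illustrative, not prescribed by this patch:

```python
# Before this patch:
#   from colossalai.trainer import Trainer, hooks
# After this patch, the trainer is imported from the legacy namespace:
from colossalai.legacy.trainer import Trainer, hooks
from colossalai.logging import get_dist_logger

logger = get_dist_logger()

# Illustrative only -- `engine` and `train_dataloader` come from colossalai.initialize(...)
# trainer = Trainer(engine=engine, logger=logger)
# trainer.fit(
#     train_dataloader=train_dataloader,
#     epochs=NUM_EPOCHS,
#     hooks=[hooks.LossHook(), hooks.LogMetricByEpochHook(logger=logger)],
#     display_progress=True,
# )
```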