From c294625d7c11d12090473a4795738448ff7c31e8 Mon Sep 17 00:00:00 2001 From: JackCaoG <59073027+JackCaoG@users.noreply.github.com> Date: Wed, 22 May 2024 16:46:17 -0700 Subject: [PATCH] reorganize the example dir (#7097) --- examples/data_parallel/README.md | 2 ++ examples/{ => data_parallel}/train_resnet_ddp.py | 6 ++++++ .../{ => data_parallel}/train_resnet_spmd_data_parallel.py | 4 ++++ examples/{ => data_parallel}/train_resnet_xla_ddp.py | 7 +++++++ examples/{ => debug}/train_resnet_benchmark.py | 4 ++++ examples/{ => debug}/train_resnet_profile.py | 5 ++++- examples/fsdp/README.md | 2 ++ examples/{ => fsdp}/train_decoder_only_fsdp_v2.py | 4 ++++ examples/{ => fsdp}/train_resnet_fsdp_auto_wrap.py | 6 +++++- 9 files changed, 38 insertions(+), 2 deletions(-) create mode 100644 examples/data_parallel/README.md rename examples/{ => data_parallel}/train_resnet_ddp.py (74%) rename examples/{ => data_parallel}/train_resnet_spmd_data_parallel.py (92%) rename examples/{ => data_parallel}/train_resnet_xla_ddp.py (55%) rename examples/{ => debug}/train_resnet_benchmark.py (93%) rename examples/{ => debug}/train_resnet_profile.py (88%) create mode 100644 examples/fsdp/README.md rename examples/{ => fsdp}/train_decoder_only_fsdp_v2.py (94%) rename examples/{ => fsdp}/train_resnet_fsdp_auto_wrap.py (89%) diff --git a/examples/data_parallel/README.md b/examples/data_parallel/README.md new file mode 100644 index 00000000000..2be94bce14e --- /dev/null +++ b/examples/data_parallel/README.md @@ -0,0 +1,2 @@ +## Recommendation +Please consider using `train_resnet_spmd_data_parallel.py` since it uses SPMD internally and is very likely to yield better performance.
diff --git a/examples/train_resnet_ddp.py b/examples/data_parallel/train_resnet_ddp.py similarity index 74% rename from examples/train_resnet_ddp.py rename to examples/data_parallel/train_resnet_ddp.py index fd9e0b7f3b4..898983a714b 100644 --- a/examples/train_resnet_ddp.py +++ b/examples/data_parallel/train_resnet_ddp.py @@ -1,4 +1,9 @@ +import sys +import os +example_folder = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0]))) +sys.path.append(example_folder) from train_resnet_base import TrainResNetBase + import torch.distributed as dist from torch.nn.parallel import DistributedDataParallel as DDP import torch.optim as optim @@ -21,4 +26,5 @@ def _mp_fn(index): if __name__ == '__main__': + print('consider using train_resnet_spmd_data_parallel.py instead to get better performance') xmp.spawn(_mp_fn, args=()) diff --git a/examples/train_resnet_spmd_data_parallel.py b/examples/data_parallel/train_resnet_spmd_data_parallel.py similarity index 92% rename from examples/train_resnet_spmd_data_parallel.py rename to examples/data_parallel/train_resnet_spmd_data_parallel.py index 7aa53a7bf9a..3a5eaca39a5 100644 --- a/examples/train_resnet_spmd_data_parallel.py +++ b/examples/data_parallel/train_resnet_spmd_data_parallel.py @@ -1,3 +1,7 @@ +import sys +import os +example_folder = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0]))) +sys.path.append(example_folder) from train_resnet_base import TrainResNetBase import numpy as np diff --git a/examples/train_resnet_xla_ddp.py b/examples/data_parallel/train_resnet_xla_ddp.py similarity index 55% rename from examples/train_resnet_xla_ddp.py rename to examples/data_parallel/train_resnet_xla_ddp.py index 83b27e46878..4ac99904422 100644 --- a/examples/train_resnet_xla_ddp.py +++ b/examples/data_parallel/train_resnet_xla_ddp.py @@ -1,4 +1,9 @@ +import sys +import os +example_folder = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0]))) +sys.path.append(example_folder) from train_resnet_base import 
TrainResNetBase + import torch_xla.distributed.xla_multiprocessing as xmp import torch_xla.core.xla_model as xm @@ -6,6 +11,7 @@ class TrainResNetXLADDP(TrainResNetBase): def run_optimizer(self): + # optimizer_step will call `optimizer.step()` and all_reduce the gradient xm.optimizer_step(self.optimizer) @@ -15,4 +21,5 @@ def _mp_fn(index): if __name__ == '__main__': + print('consider using train_resnet_spmd_data_parallel.py instead to get better performance') xmp.spawn(_mp_fn, args=()) diff --git a/examples/train_resnet_benchmark.py b/examples/debug/train_resnet_benchmark.py similarity index 93% rename from examples/train_resnet_benchmark.py rename to examples/debug/train_resnet_benchmark.py index cca86fcb08d..50afe7f807f 100644 --- a/examples/train_resnet_benchmark.py +++ b/examples/debug/train_resnet_benchmark.py @@ -1,3 +1,7 @@ +import sys +import os +example_folder = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0]))) +sys.path.append(example_folder) from train_resnet_base import TrainResNetBase import itertools diff --git a/examples/train_resnet_profile.py b/examples/debug/train_resnet_profile.py similarity index 88% rename from examples/train_resnet_profile.py rename to examples/debug/train_resnet_profile.py index 4846953b62d..886c25f65b7 100644 --- a/examples/train_resnet_profile.py +++ b/examples/debug/train_resnet_profile.py @@ -1,6 +1,9 @@ import os - +import sys +example_folder = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0]))) +sys.path.append(example_folder) from train_resnet_base import TrainResNetBase + import torch_xla.debug.profiler as xp # check https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md#environment-variables diff --git a/examples/fsdp/README.md b/examples/fsdp/README.md new file mode 100644 index 00000000000..761ae64e16d --- /dev/null +++ b/examples/fsdp/README.md @@ -0,0 +1,2 @@ +## Recommendation +Please consider using `train_decoder_only_fsdp_v2.py` since it uses SPMD internally and is very likely to
yield better performance. diff --git a/examples/train_decoder_only_fsdp_v2.py b/examples/fsdp/train_decoder_only_fsdp_v2.py similarity index 94% rename from examples/train_decoder_only_fsdp_v2.py rename to examples/fsdp/train_decoder_only_fsdp_v2.py index 17e68ba6d5b..71d50da7f38 100644 --- a/examples/train_decoder_only_fsdp_v2.py +++ b/examples/fsdp/train_decoder_only_fsdp_v2.py @@ -1,3 +1,7 @@ +import sys +import os +example_folder = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0]))) +sys.path.append(example_folder) import decoder_only_model from train_decoder_only_base import TrainDecoderOnlyBase diff --git a/examples/train_resnet_fsdp_auto_wrap.py b/examples/fsdp/train_resnet_fsdp_auto_wrap.py similarity index 89% rename from examples/train_resnet_fsdp_auto_wrap.py rename to examples/fsdp/train_resnet_fsdp_auto_wrap.py index 4b0e0edab1e..a7dc5679d00 100644 --- a/examples/train_resnet_fsdp_auto_wrap.py +++ b/examples/fsdp/train_resnet_fsdp_auto_wrap.py @@ -1,3 +1,7 @@ +import sys +import os +example_folder = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0]))) +sys.path.append(example_folder) from train_resnet_base import TrainResNetBase from functools import partial @@ -10,7 +14,6 @@ from torch_xla.distributed.fsdp.wrap import (size_based_auto_wrap_policy, transformer_auto_wrap_policy) - class TrainResNetXLAFSDP(TrainResNetBase): def __init__(self): @@ -49,4 +52,5 @@ def _mp_fn(index): if __name__ == '__main__': + print('consider using train_decoder_only_fsdp_v2.py instead to get better performance') xmp.spawn(_mp_fn, args=())