From 66573ec84cdcf99d0562fcffbaea90f77dfd6ede Mon Sep 17 00:00:00 2001 From: Parth Raut <68670266+parthraut@users.noreply.github.com> Date: Thu, 12 Dec 2024 21:01:25 -0500 Subject: [PATCH] Apply suggestions from code review Co-authored-by: Jae-Won Chung --- examples/power_limit_optimizer/README.md | 10 ++------- zeus/optimizer/power_limit.py | 27 ++++++++++++------------ zeus/utils/framework.py | 4 ++-- 3 files changed, 17 insertions(+), 24 deletions(-) diff --git a/examples/power_limit_optimizer/README.md b/examples/power_limit_optimizer/README.md index 5dab45c2..ca79eaf6 100644 --- a/examples/power_limit_optimizer/README.md +++ b/examples/power_limit_optimizer/README.md @@ -26,14 +26,14 @@ You just need to download and extract the ImageNet data and mount it to the Dock ## Multi-GPU Distributed Training (Pytorch DDP and FSDP) -When using `ZeusMonitor` and/or `GlobalPowerLimitOptimizer` in a multi-GPU Distributed context, launch one instance of `ZeusMonitor` and/or `GlobalPowerLimitOptimizer` per local rank (per GPU on each node), and pass in the local rank to `ZeusMonitor` as shown below: +When using `ZeusMonitor` and/or `GlobalPowerLimitOptimizer` in a multi-GPU Distributed context, construct one instance of `ZeusMonitor` and/or `GlobalPowerLimitOptimizer` per local rank (per GPU on each node), and pass in the local rank to `ZeusMonitor` as shown below: ```python monitor = ZeusMonitor(gpu_indices=[local_rank]) # pass in local rank to gpu_indices. plo = GlobalPowerLimitOptimizer(monitor) ``` -Ensure that only one GPU is monitored per `ZeusMonitor`. Internally, `GlobalPowerLimitOptimizer` performs an [All-Reduce](https://pytorch.org/docs/stable/distributed.html) to synchronize before making a power limit decision. +Ensure that only one GPU is monitored per `ZeusMonitor`. Internally, `GlobalPowerLimitOptimizer` performs an [AllReduce](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/collectives.html) to aggregate time and energy measurements across all GPUs before making a power limit decision. ## Example command @@ -59,12 +59,6 @@ torchrun \ --nnodes 1 \ --nproc_per_node=gpu `# Number of processes per node, should be equal to the number of GPUs.` \ train_fsdp.py \ - --batch-size 64 `# Batch size for training.` \ - --test-batch-size 1000 `# Batch size for testing.` \ - --epochs 10 `# Number of epochs to train.` \ - --lr 1.0 `# Learning rate.` \ - --gamma 0.7 `# Learning rate step gamma.` \ - --save-model `# Save the trained model.` \ [DATA_DIR] ``` diff --git a/zeus/optimizer/power_limit.py b/zeus/optimizer/power_limit.py index d711c8c3..73234e02 100644 --- a/zeus/optimizer/power_limit.py +++ b/zeus/optimizer/power_limit.py @@ -203,23 +203,22 @@ class GlobalPowerLimitOptimizer(Callback): This optimizer uses the JIT profiling log to determine the optimal power limit. - Non-distributed training (Single GPU or Multi-GPU on a single node): - Launch one instance of `ZeusMonitor` and `GlobalPowerLimitOptimizer`, and have `ZeusMonitor` track all desired GPUs. - For example, to track all GPUs on a single node: + ## Usage with distributed data parallelism + + The global power limit optimizer expects one process to control each GPU used for training. + For instance, `torchrun` will automatically spawn one process for each GPU on the node. + Correspondingly, the [`ZeusMonitor`][zeus.monitor.energy.ZeusMonitor] instance passed in + should be monitoring **one GPU**: the one being managed by the current process. The index of + this GPU would typically match the local rank of the process. In the case of PyTorch, users would have + called `torch.cuda.set_device` early on, so `torch.cuda.current_device` will give you the GPU index. + `GlobalPowerLimitOptimizer` will internally do an AllReduce across all GPUs to aggregate + time and energy measurements, and then select the globally optimal power limit. + + ```python - monitor = ZeusMonitor(gpu_indices=None) # monitor all GPUs + monitor = ZeusMonitor(gpu_indices=[local_rank]) # pass in local rank to gpu_indices. plo = GlobalPowerLimitOptimizer(monitor) ``` - - Distributed training (Multi-GPU on multiple nodes): - `ZeusMonitor` and `GlobalPowerLimitOptimizer` make the assumption that each GPU is monitored by one and only one instance of `ZeusMonitor` to ensure correct functionality. - Therefore, it is recommended to launch one instance of `ZeusMonitor` and `GlobalPowerLimitOptimizer` - per device (per GPU on each node), and pass in the local rank to `ZeusMonitor` as shown below: - ```python - monitor = ZeusMonitor(gpu_indices=[local_rank]) # pass in local rank to gpu_indices. - plo = GlobalPowerLimitOptimizer(monitor) - ``` - Internally, `GlobalPowerLimitOptimizer` performs an all-reduce over all devices to compute the optimal power limit. """ def __init__( diff --git a/zeus/utils/framework.py b/zeus/utils/framework.py index 3459eda2..b56b5c56 100644 --- a/zeus/utils/framework.py +++ b/zeus/utils/framework.py @@ -3,7 +3,7 @@ from __future__ import annotations import types -from typing import Literal, List +from typing import Literal from functools import lru_cache from zeus.utils.logging import get_logger @@ -105,7 +105,7 @@ def sync_execution( def all_reduce( - object: List[int] | List[float], operation: Literal["sum", "max"] + object: list[int] | list[float], operation: Literal["sum", "max"] ) -> int | float: """Reduce objects from all replicas through the specified operation.