From a1fbf94cf6101106716b08d20dd4f4f24d1c067e Mon Sep 17 00:00:00 2001 From: Saaketh Date: Wed, 31 Jul 2024 15:57:29 -0400 Subject: [PATCH 1/9] yo --- composer/devices/device_gpu.py | 3 +++ tests/checkpoint/test_state_dict.py | 6 +++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/composer/devices/device_gpu.py b/composer/devices/device_gpu.py index 19cb0a774a..401368576e 100644 --- a/composer/devices/device_gpu.py +++ b/composer/devices/device_gpu.py @@ -12,6 +12,7 @@ import torch.backends.cudnn import torch.cuda import torch.cuda.amp +import torch.distributed as torch_dist import torch.utils.data from composer.devices.device import Device @@ -42,6 +43,8 @@ def __init__( ): if not torch.cuda.is_available(): raise ValueError('DeviceGPU cannot be created as torch.cuda is not available.') + if torch_dist.is_gloo_available(): + DeviceGPU.dist_backend = 'cuda:nccl,cpu:gloo' if device_id is None: device_id = dist.get_local_rank() self._device = torch.device(f'cuda:{device_id}') diff --git a/tests/checkpoint/test_state_dict.py b/tests/checkpoint/test_state_dict.py index 12fde27249..939789b364 100644 --- a/tests/checkpoint/test_state_dict.py +++ b/tests/checkpoint/test_state_dict.py @@ -6,6 +6,7 @@ import pytest import torch +import torch.distributed as torch_dist from packaging import version from torch.distributed.fsdp import FullyShardedDataParallel as FSDP @@ -433,7 +434,10 @@ def test_get_metadata_sharded_model(model_type: str, tensor_type: str, world_siz assert 'model_name' in metadata_sd assert 'dist_backend' in metadata_sd - assert metadata_sd['dist_backend'] == 'nccl' + if torch_dist.is_gloo_available(): + assert metadata_sd['dist_backend'] == 'cuda:nccl,cpu:gloo' + else: + assert metadata_sd['dist_backend'] == 'nccl' @pytest.mark.filterwarnings('ignore:SWA has') From 8ff4dfcabfc3a2573e8be0bf6b730cfef56ac9d6 Mon Sep 17 00:00:00 2001 From: Saaketh Date: Fri, 2 Aug 2024 13:40:12 -0400 Subject: [PATCH 2/9] yo --- .github/workflows/daily.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml index aa97c755c8..025ae8c4cc 100644 --- a/.github/workflows/daily.yaml +++ b/.github/workflows/daily.yaml @@ -124,12 +124,12 @@ jobs: pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" gpu_num: 1 - - name: "gpu-3.10-2.1-2-gpu" - container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 - markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" - pytest_command: "coverage run -m pytest" - composer_package_name: "mosaicml" - gpu_num: 2 + # - name: "gpu-3.10-2.1-2-gpu" + # container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 + # markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" + # pytest_command: "coverage run -m pytest" + # composer_package_name: "mosaicml" + # gpu_num: 2 - name: "gpu-3.11-2.2-2-gpu" container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" From 1f73c7d03e6dabaeda56dbd5d08a89ec242ce7a5 Mon Sep 17 00:00:00 2001 From: Saaketh Date: Fri, 2 Aug 2024 16:18:40 -0400 Subject: [PATCH 3/9] yo --- .github/workflows/daily.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml index 025ae8c4cc..527c37819e 100644 --- a/.github/workflows/daily.yaml +++ b/.github/workflows/daily.yaml @@ -130,12 +130,12 @@ jobs: # pytest_command: "coverage run -m pytest" # composer_package_name: "mosaicml" # gpu_num: 2 - - name: "gpu-3.11-2.2-2-gpu" - container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 - markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" - pytest_command: "coverage run -m pytest" - composer_package_name: "mosaicml" - gpu_num: 2 + # - name: "gpu-3.11-2.2-2-gpu" + # container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 + # markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" + # pytest_command: "coverage run -m pytest" + # composer_package_name: "mosaicml" + # gpu_num: 2 - name: "gpu-3.11-2.3-2-gpu" container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" From 46e93122d42c30b8dfe7f3694dfa760c4b3cf01f Mon Sep 17 00:00:00 2001 From: Saaketh Date: Fri, 2 Aug 2024 16:48:37 -0400 Subject: [PATCH 4/9] yo --- .github/workflows/daily.yaml | 24 ++++++++++++------------ composer/devices/device_gpu.py | 4 +++- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml index 527c37819e..aa97c755c8 100644 --- a/.github/workflows/daily.yaml +++ b/.github/workflows/daily.yaml @@ -124,18 +124,18 @@ jobs: pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" gpu_num: 1 - # - name: "gpu-3.10-2.1-2-gpu" - # container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 - # markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" - # pytest_command: "coverage run -m pytest" - # composer_package_name: "mosaicml" - # gpu_num: 2 - # - name: "gpu-3.11-2.2-2-gpu" - # container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 - # markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" - # pytest_command: "coverage run -m pytest" - # composer_package_name: "mosaicml" - # gpu_num: 2 + - name: "gpu-3.10-2.1-2-gpu" + container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 + markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" + pytest_command: "coverage run -m pytest" + composer_package_name: "mosaicml" + gpu_num: 2 + - name: "gpu-3.11-2.2-2-gpu" + container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 + markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" + pytest_command: "coverage run -m pytest" + composer_package_name: "mosaicml" + gpu_num: 2 - name: "gpu-3.11-2.3-2-gpu" container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" diff --git a/composer/devices/device_gpu.py b/composer/devices/device_gpu.py index 401368576e..00578a8171 100644 --- a/composer/devices/device_gpu.py +++ b/composer/devices/device_gpu.py @@ -14,6 +14,7 @@ import torch.cuda.amp import torch.distributed as torch_dist import torch.utils.data +from packaging import version from composer.devices.device import Device from composer.utils import dist @@ -43,7 +44,8 @@ def __init__( ): if not torch.cuda.is_available(): raise ValueError('DeviceGPU cannot be created as torch.cuda is not available.') - if torch_dist.is_gloo_available(): + if torch_dist.is_gloo_available() or version.parse(torch.__version__) >= version.parse('2.3.0.dev'): + # Composer checkpoint load / save from before torch 2.3.0 onwards is not compatible with gloo + nccl backends. DeviceGPU.dist_backend = 'cuda:nccl,cpu:gloo' if device_id is None: device_id = dist.get_local_rank() From 78583a44bc90a80e03ea63d6d40facdb601cd083 Mon Sep 17 00:00:00 2001 From: Saaketh Date: Mon, 5 Aug 2024 12:54:20 -0400 Subject: [PATCH 5/9] yo --- composer/devices/device_gpu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/composer/devices/device_gpu.py b/composer/devices/device_gpu.py index 00578a8171..a6587c1863 100644 --- a/composer/devices/device_gpu.py +++ b/composer/devices/device_gpu.py @@ -44,8 +44,8 @@ def __init__( ): if not torch.cuda.is_available(): raise ValueError('DeviceGPU cannot be created as torch.cuda is not available.') - if torch_dist.is_gloo_available() or version.parse(torch.__version__) >= version.parse('2.3.0.dev'): - # Composer checkpoint load / save from before torch 2.3.0 onwards is not compatible with gloo + nccl backends. + if torch_dist.is_gloo_available() or version.parse(torch.__version__) >= version.parse('2.3.0'): + # Composer checkpoint load / save from before torch 2.3.0 is not compatible with gloo + nccl backends. DeviceGPU.dist_backend = 'cuda:nccl,cpu:gloo' if device_id is None: device_id = dist.get_local_rank() From e75fa2398fa2b7fa596caec18473abd14f9f4af8 Mon Sep 17 00:00:00 2001 From: Saaketh Date: Mon, 5 Aug 2024 13:56:41 -0400 Subject: [PATCH 6/9] yo --- composer/devices/device_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/devices/device_gpu.py b/composer/devices/device_gpu.py index a6587c1863..c17dee3a3a 100644 --- a/composer/devices/device_gpu.py +++ b/composer/devices/device_gpu.py @@ -44,7 +44,7 @@ def __init__( ): if not torch.cuda.is_available(): raise ValueError('DeviceGPU cannot be created as torch.cuda is not available.') - if torch_dist.is_gloo_available() or version.parse(torch.__version__) >= version.parse('2.3.0'): + if torch_dist.is_gloo_available() and version.parse(torch.__version__) >= version.parse('2.3.0'): # Composer checkpoint load / save from before torch 2.3.0 is not compatible with gloo + nccl backends. DeviceGPU.dist_backend = 'cuda:nccl,cpu:gloo' if device_id is None: From 69ab6178e0d57224769312ff336c24c7fee65a32 Mon Sep 17 00:00:00 2001 From: Saaketh Date: Mon, 5 Aug 2024 14:00:01 -0400 Subject: [PATCH 7/9] yo --- tests/checkpoint/test_state_dict.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/checkpoint/test_state_dict.py b/tests/checkpoint/test_state_dict.py index cc7280f93b..99d9146aae 100644 --- a/tests/checkpoint/test_state_dict.py +++ b/tests/checkpoint/test_state_dict.py @@ -447,7 +447,7 @@ def test_get_metadata_sharded_model(model_type: str, tensor_type: str, world_siz assert 'model_name' in metadata_sd assert 'dist_backend' in metadata_sd - if torch_dist.is_gloo_available(): + if torch_dist.is_gloo_available() and version.parse(torch.__version__) >= version.parse('2.3.0'): assert metadata_sd['dist_backend'] == 'cuda:nccl,cpu:gloo' else: assert metadata_sd['dist_backend'] == 'nccl' From d2d33b646ef7b481abadeff99b05615b9d0f3065 Mon Sep 17 00:00:00 2001 From: Saaketh Date: Mon, 5 Aug 2024 14:46:13 -0400 Subject: [PATCH 8/9] yo --- composer/devices/device_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/devices/device_gpu.py b/composer/devices/device_gpu.py index c17dee3a3a..5d2f406026 100644 --- a/composer/devices/device_gpu.py +++ b/composer/devices/device_gpu.py @@ -44,7 +44,7 @@ def __init__( ): if not torch.cuda.is_available(): raise ValueError('DeviceGPU cannot be created as torch.cuda is not available.') - if torch_dist.is_gloo_available() and version.parse(torch.__version__) >= version.parse('2.3.0'): + if torch_dist.is_gloo_available() #and version.parse(torch.__version__) >= version.parse('2.3.0'): # Composer checkpoint load / save from before torch 2.3.0 is not compatible with gloo + nccl backends. DeviceGPU.dist_backend = 'cuda:nccl,cpu:gloo' if device_id is None: From 60f530a5fff55088beef6070a0d0e190fdb9c16a Mon Sep 17 00:00:00 2001 From: Saaketh Date: Mon, 5 Aug 2024 14:47:13 -0400 Subject: [PATCH 9/9] slam --- composer/devices/device_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/devices/device_gpu.py b/composer/devices/device_gpu.py index 5d2f406026..c17dee3a3a 100644 --- a/composer/devices/device_gpu.py +++ b/composer/devices/device_gpu.py @@ -44,7 +44,7 @@ def __init__( ): if not torch.cuda.is_available(): raise ValueError('DeviceGPU cannot be created as torch.cuda is not available.') - if torch_dist.is_gloo_available() #and version.parse(torch.__version__) >= version.parse('2.3.0'): + if torch_dist.is_gloo_available() and version.parse(torch.__version__) >= version.parse('2.3.0'): # Composer checkpoint load / save from before torch 2.3.0 is not compatible with gloo + nccl backends. DeviceGPU.dist_backend = 'cuda:nccl,cpu:gloo' if device_id is None: