From 4fe3ea7e14426ec99bf02dfda2709e5dbcff0580 Mon Sep 17 00:00:00 2001 From: "will.gleich" <22464726+willgleich@users.noreply.github.com> Date: Wed, 21 Aug 2024 16:42:00 -0600 Subject: [PATCH] Bump aws_ofi_nccl to 1.11.0 --- docker/build_matrix.yaml | 6 +++--- docker/generate_build_matrix.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index ee74d12309..66a8a192e6 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -27,7 +27,7 @@ - mosaicml/pytorch:latest TARGET: pytorch_stage TORCHVISION_VERSION: 0.18.1 -- AWS_OFI_NCCL_VERSION: v1.9.1-aws +- AWS_OFI_NCCL_VERSION: v1.11.0-aws BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.1 IMAGE_NAME: torch-2-3-1-cu121-aws @@ -97,7 +97,7 @@ - mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.17.2 -- AWS_OFI_NCCL_VERSION: v1.9.1-aws +- AWS_OFI_NCCL_VERSION: v1.11.0-aws BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.1 IMAGE_NAME: torch-2-2-2-cu121-aws @@ -165,7 +165,7 @@ - mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.16.2 -- AWS_OFI_NCCL_VERSION: v1.9.1-aws +- AWS_OFI_NCCL_VERSION: v1.11.0-aws BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.1 IMAGE_NAME: torch-2-1-2-cu121-aws diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index 74d9c7fed4..85850334be 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -224,7 +224,7 @@ def _main(): if interconnect != 'EFA': entry['AWS_OFI_NCCL_VERSION'] = '' else: - entry['AWS_OFI_NCCL_VERSION'] = 'v1.9.1-aws' + entry['AWS_OFI_NCCL_VERSION'] = 'v1.11.0-aws' pytorch_entries.append(entry)