diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index f133abab6d..92972501ca 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -26,7 +26,7 @@ - mosaicml/pytorch:2.3.0_cu121-python3.11-ubuntu20.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.18.0 -- AWS_OFI_NCCL_VERSION: v1.7.4-aws +- AWS_OFI_NCCL_VERSION: v1.9.1-aws BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.1 IMAGE_NAME: torch-2-3-0-cu121-aws @@ -94,7 +94,7 @@ - mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.17.1 -- AWS_OFI_NCCL_VERSION: v1.7.4-aws +- AWS_OFI_NCCL_VERSION: v1.9.1-aws BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.1 IMAGE_NAME: torch-2-2-1-cu121-aws @@ -162,7 +162,7 @@ - mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.16.2 -- AWS_OFI_NCCL_VERSION: v1.7.4-aws +- AWS_OFI_NCCL_VERSION: v1.9.1-aws BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.1 IMAGE_NAME: torch-2-1-2-cu121-aws diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index c8cec98d25..a652dbb39c 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -224,7 +224,7 @@ def _main(): if interconnect != 'EFA': entry['AWS_OFI_NCCL_VERSION'] = '' else: - entry['AWS_OFI_NCCL_VERSION'] = 'v1.7.4-aws' + entry['AWS_OFI_NCCL_VERSION'] = 'v1.9.1-aws' pytorch_entries.append(entry)