From 5086512e7046c9fe1ce3cbd5c1712a610e73027a Mon Sep 17 00:00:00 2001
From: Verdi March
Date: Fri, 12 Apr 2024 10:23:45 +0800
Subject: [PATCH] Update template pytorch dockerfile with cuda sync memops stanza

---
Notes (ignored by `git am`):
  * Bumps the documented minimum efa-installer version from 1.29.0 to 1.29.1
    and adds a commented-out FI_EFA_SET_CUDA_SYNC_MEMOPS=0 stanza for images
    that pair an older efa-installer with nccl>=2.19.0.
  * Line structure of this patch was restored from a whitespace-collapsed
    copy; context-line indentation is a best-effort reconstruction. If the
    hunk is rejected, retry with `git am --ignore-whitespace`.

 .../containers/pytorch/0.nvcr-pytorch-aws.dockerfile | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile b/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile
index 6f446e3e..49ab8c03 100644
--- a/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile
+++ b/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile
@@ -23,11 +23,15 @@ FROM nvcr.io/nvidia/pytorch:23.12-py3
 ENV DEBIAN_FRONTEND=noninteractive
 
 # The three must-be-built packages.
-# Efa-installer>=1.29.0 required for nccl>=2.19.0 to avoid libfabric NCCL error.
+# Efa-installer>=1.29.1 required for nccl>=2.19.0 to avoid libfabric NCCL error.
 ENV EFA_INSTALLER_VERSION=1.30.0
 ENV AWS_OFI_NCCL_VERSION=1.8.1-aws
 ENV NCCL_TESTS_VERSION=master
 
+# Uncomment the ENV line below when this Dockerfile builds a container image with
+# efa-installer<1.29.1 and nccl>=2.19.0. See https://github.com/aws-samples/awsome-distributed-training/blob/main/1.architectures/efa-cheatsheet.md
+#ENV FI_EFA_SET_CUDA_SYNC_MEMOPS=0
+
 RUN apt-get update -y
 RUN apt-get remove -y --allow-change-held-packages \
                       libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1