From 228c59cf9de709638054f5de6ddb8bcc267e3a26 Mon Sep 17 00:00:00 2001
From: Verdi March
Date: Fri, 12 Apr 2024 10:16:13 +0800
Subject: [PATCH 1/2] efa: document the precise lib versions that necessitate cuda sync mem env var

---
 1.architectures/efa-cheatsheet.md | 70 +++++++++++++++---------------
 1 file changed, 35 insertions(+), 35 deletions(-)

diff --git a/1.architectures/efa-cheatsheet.md b/1.architectures/efa-cheatsheet.md
index 90cbe2c2..83362089 100644
--- a/1.architectures/efa-cheatsheet.md
+++ b/1.architectures/efa-cheatsheet.md
@@ -5,23 +5,23 @@
 For optimized performance, you may need to set additional environment variables depending on the versions of your libfabric.
 
-| Setting                        | Explanation |
-| ------------------------------ | ----------- |
-| `FI_EFA_USE_HUGE_PAGE=0`       | Set to 0 when you see `os.fork()` causes `OSError: Cannot allocate memory`. Typically happen by multi-process PyTorch data loader. Disabling huge page causes minor performance hit, but it's needed to prevent fork fails due to the operating system running out of huge pages. |
-| `FI_EFA_FORK_SAFE=1`           | Not needed for kernel>=5.15. Still fine to set it though no effect. See [ref](https://github.com/ofiwg/libfabric/pull/9112). |
-| `FI_EFA_USE_DEVICE_RDMA=1`     | Do not set for libfabric>=1.18.0 and aws-ofi-nccl>=1.7.0. It's not harmful to set it on p4/p5 on the newer software, but you just don't have to set it. |
-| `FI_EFA_SET_CUDA_SYNC_MEMOPS`  | Set this to `0` if you see the error `register_rail_mr_buffer:617 NCCL WARN NET/OFI Unable to register memory (type = 2) for device 4. RC: -22, Error: Invalid argument`. |
-| `FI_EFA_ENABLE_SHM_TRANSFER=1` | Not needed. This is really a no-op, the default already to enable SHMEM |
-| `FI_PROVIDER=efa`              | Use for aws-ofi-nccl<=1.5.0 AND p4/p5 instances. |
-| `NCCL_PROTO=simple`            | Use for aws-ofi-nccl<=1.5.0 and p4/p5 instances. |
-| `NCCL_SOCKET_NTHREADS`         | Not applicable for EFA. |
-| `NCCL_SOCKET_IFNAME`           | Set this to `en` to cover both `p5.48xlarge` and `p4d(e).24xlarge`. For other instances check `ifconfig` to see the active network interface. |
-| `NCCL_NSOCKS_PERTHREAD`        | Not applicable for EFA. |
-| `NCCL_MIN_CHANNELS=xxx`        | Recommend to leave it out to use the default. For e.g., on p4d/p4de, the number of channels should be 8, which is the minimum for a 4-NIC platform. The reduction message is split by number of GPUs in the job, then the number of channels, so having more channels than necessary causes smaller messages which causes EFA to be starved for data. |
-| `NCCL_BUFFSIZE=xxx`            | Recommend to leave it out to use the default. |
-| `RDMAV_FORK_SAFE=1`            | Do not use. This is a RDMA-core environment variable. Prefer `FI_EFA_FORK_SAFE` (if it still makes sense for your Linux kernel version). The two looks the same, but actually behaves very differently, especially on newer kernels, where `RDMAV_FORK_SAFE=1` can break things. |
-| `RDMAV_*`                      | Do not use |
-| NCCL version                   | Recommend one of the stable releases. |
+| Setting                         | Explanation |
+| ------------------------------- | ----------- |
+| `FI_EFA_USE_HUGE_PAGE=0`        | Set this to 0 when you see `os.fork()` fail with `OSError: Cannot allocate memory`, which typically happens with multi-process PyTorch data loaders. Disabling huge pages causes a minor performance hit, but it is needed to prevent forks from failing when the operating system runs out of huge pages. |
+| `FI_EFA_FORK_SAFE=1`            | Not needed for kernel>=5.15. It is still fine to set it, although it has no effect. See [ref](https://github.com/ofiwg/libfabric/pull/9112). |
+| `FI_EFA_USE_DEVICE_RDMA=1`      | Do not set for libfabric>=1.18.0 and aws-ofi-nccl>=1.7.0. It is not harmful to set it on p4/p5 with the newer software, but it is simply unnecessary. |
+| `FI_EFA_SET_CUDA_SYNC_MEMOPS=0` | Set this when using `efa-installer<1.29.1` with `nccl>=2.19.0` to prevent the NCCL error `register_rail_mr_buffer:617 NCCL WARN NET/OFI Unable to register memory (type = 2) for device 4. RC: -22, Error: Invalid argument`. |
+| `FI_EFA_ENABLE_SHM_TRANSFER=1`  | Not needed. This is effectively a no-op; the default already enables SHMEM. |
+| `FI_PROVIDER=efa`               | Use for aws-ofi-nccl<=1.5.0 AND p4/p5 instances. |
+| `NCCL_PROTO=simple`             | Use for aws-ofi-nccl<=1.5.0 and p4/p5 instances. |
+| `NCCL_SOCKET_NTHREADS`          | Not applicable for EFA. |
+| `NCCL_SOCKET_IFNAME`            | Set this to `en` to cover both `p5.48xlarge` and `p4d(e).24xlarge`. For other instances, check `ifconfig` to see the active network interface. |
+| `NCCL_NSOCKS_PERTHREAD`         | Not applicable for EFA. |
+| `NCCL_MIN_CHANNELS=xxx`         | Recommended to leave unset so that the default is used. For example, on p4d/p4de the number of channels should be 8, which is the minimum for a 4-NIC platform. The reduction message is split by the number of GPUs in the job, then by the number of channels, so having more channels than necessary produces smaller messages, which starves EFA of data. |
+| `NCCL_BUFFSIZE=xxx`             | Recommended to leave unset so that the default is used. |
+| `RDMAV_FORK_SAFE=1`             | Do not use. This is an RDMA-core environment variable; prefer `FI_EFA_FORK_SAFE` (if it still makes sense for your Linux kernel version). The two look the same but behave very differently, especially on newer kernels, where `RDMAV_FORK_SAFE=1` can break things. |
+| `RDMAV_*`                       | Do not use. |
+| NCCL version                    | Recommended to use one of the stable releases. |
 
 ## 2. A word on p5.48xlarge instances
 
@@ -30,29 +30,30 @@ Use cuda>=12.0, nccl>=2.18.0 (recommend at least 2.18.5), aws-ofi-nccl>=1.7.2 (r
 
 The table below shows number of NVLinks for `p4de.24xlarge` and `p5.48xlarge` instances:
 
-| Instance | GPU | # NVLinks | Generation |
-|:-----------:|:----------:|:---------:| :---------:|
-|p4de.24xlarge| A100 80GB | 12 | 3rd |
-| p5.48xlarge | H100 | 18 | 4th |
+| Instance      | GPU       | # NVLinks | Generation |
+| :-----------: | :-------: | :-------: | :--------: |
+| p4de.24xlarge | A100 80GB | 12        | 3rd        |
+| p5.48xlarge   | H100      | 18        | 4th        |
 
 `nvidia-smi nvlink -s` is the command to get the status for all NVLinks for each of the GPUs.
Below we see this data for GPU 0 of a `p4de.24xlarge` instance ```bash ubuntu@ip-172-31-35-99:~$ nvidia-smi nvlink -s GPU 0: NVIDIA A100-SXM4-80GB (UUID: GPU-370ec676-e407-3115-836a-8ebcb3c4f62a) - Link 0: 25 GB/s - Link 1: 25 GB/s - Link 2: 25 GB/s - Link 3: 25 GB/s - Link 4: 25 GB/s - Link 5: 25 GB/s - Link 6: 25 GB/s - Link 7: 25 GB/s - Link 8: 25 GB/s - Link 9: 25 GB/s - Link 10: 25 GB/s - Link 11: 25 GB/s + Link 0: 25 GB/s + Link 1: 25 GB/s + Link 2: 25 GB/s + Link 3: 25 GB/s + Link 4: 25 GB/s + Link 5: 25 GB/s + Link 6: 25 GB/s + Link 7: 25 GB/s + Link 8: 25 GB/s + Link 9: 25 GB/s + Link 10: 25 GB/s + Link 11: 25 GB/s ``` + The [dcgm](https://github.com/NVIDIA/DCGM?tab=readme-ov-file) command to validate the NVLinks is `sudo dcgmi diag -r 2 -p pcie.gpu_nvlinks_expected_up=<# NVLinks>`. For `p4de.24xlarge` instance, this diagnostic looks like: ```bash @@ -81,8 +82,7 @@ Successfully ran diagnostic for group. | GPU Memory | Pass - All | +----- Stress ------------+------------------------------------------------+ +---------------------------+------------------------------------------------+ -``` - +``` ## 3. Sample Presets From 5086512e7046c9fe1ce3cbd5c1712a610e73027a Mon Sep 17 00:00:00 2001 From: Verdi March Date: Fri, 12 Apr 2024 10:23:45 +0800 Subject: [PATCH 2/2] Update template pytorch dockerfile with cuda sync memops stanza --- .../containers/pytorch/0.nvcr-pytorch-aws.dockerfile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile b/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile index 6f446e3e..49ab8c03 100644 --- a/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile +++ b/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile @@ -23,11 +23,15 @@ FROM nvcr.io/nvidia/pytorch:23.12-py3 ENV DEBIAN_FRONTEND=noninteractive # The three must-be-built packages. -# Efa-installer>=1.29.0 required for nccl>=2.19.0 to avoid libfabric NCCL error. +# Efa-installer>=1.29.1 required for nccl>=2.19.0 to avoid libfabric NCCL error. ENV EFA_INSTALLER_VERSION=1.30.0 ENV AWS_OFI_NCCL_VERSION=1.8.1-aws ENV NCCL_TESTS_VERSION=master +## Uncomment below when this Dockerfile builds a container image with efa-installer<1.29.1 and +# nccl>=2.19.0. See https://github.com/aws-samples/awsome-distributed-training/tree/main/1.architectures/efa-cheatsheet.md +#ENV FI_EFA_SET_CUDA_SYNC_MEMOPS=0 + RUN apt-get update -y RUN apt-get remove -y --allow-change-held-packages \ libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1
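Whether `FI_EFA_SET_CUDA_SYNC_MEMOPS=0` applies to a given environment depends on the versions in play. Below is a minimal sketch of one way to check them; the `/opt/amazon/efa_installed_packages` path is an assumption about where the EFA installer records what it installed, so adjust it to however your AMI or image tracks versions.

```bash
# NCCL version as seen by PyTorch, e.g. (2, 19, 3) means nccl>=2.19.0.
python -c "import torch; print(torch.cuda.nccl.version())"

# EFA installer version. This file is written by the EFA installer on hosts/images
# that used it; the path is an assumption, adjust if your image records it elsewhere.
cat /opt/amazon/efa_installed_packages
```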
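If rebuilding the image is undesirable, the same workaround can be applied when the container starts instead of uncommenting the `ENV` line above. A sketch follows; the image tag `my-pytorch-efa:latest` is a placeholder for an image built from this Dockerfile, and `--gpus all` assumes the NVIDIA Container Toolkit is installed on the host.

```bash
# Set the workaround at container start instead of baking it into the image,
# then print the container environment to confirm the variable is present.
docker run --rm --gpus all \
  -e FI_EFA_SET_CUDA_SYNC_MEMOPS=0 \
  my-pytorch-efa:latest \
  env | grep FI_EFA
```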
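Finally, for the cheatsheet table in the first patch, here is a minimal sketch of how the relevant variables might be exported in a Slurm batch script. The job name, node count, and training command are placeholders; keep only the variables that match your libfabric, aws-ofi-nccl, and NCCL versions.

```bash
#!/bin/bash
#SBATCH --job-name=efa-train-example   # placeholder job name
#SBATCH --nodes=2                      # placeholder node count
#SBATCH --exclusive

# Only if multi-process data loaders hit "OSError: Cannot allocate memory" on os.fork().
export FI_EFA_USE_HUGE_PAGE=0

# Only for efa-installer<1.29.1 together with nccl>=2.19.0 (see the table above).
export FI_EFA_SET_CUDA_SYNC_MEMOPS=0

# Covers the EFA-attached interfaces on p4d(e).24xlarge and p5.48xlarge.
export NCCL_SOCKET_IFNAME=en

srun python train.py   # placeholder training entry point
```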