Modified PR to reflect feedback from reviewers
iankouls-aws committed Jul 22, 2024
1 parent 12f8ad7 commit 0e541bc
Showing 3 changed files with 46 additions and 67 deletions.
39 changes: 15 additions & 24 deletions patterns/nvidia-gpu-efa/README.md
@@ -1,6 +1,6 @@
-# EKS Cluster w/ NVIDIA GPUs and EFA for AI/ML Workloads
+# EKS Cluster w/ NVIDIA GPUs and EFA for Machine Learning

-This pattern demonstrates an Amazon EKS Cluster with an EFA-enabled nodegroup that utilizes `p5.48xlarge` instances with H100 NVIDIA GPUs used in distributed, multi-node AI/ML workloads.
+This pattern demonstrates an Amazon EKS Cluster with an EFA-enabled nodegroup that utilizes `p5.48xlarge` instances with H100 NVIDIA GPUs used in distributed, multi-node machine learning.

The following components are demonstrated in this pattern:

@@ -17,29 +17,29 @@ The following components are demonstrated in this pattern:

## Code

-The code consists of the following files:
-
-[eks.tf](eks.tf) - template of the EKS cluster, containing a default node group and nvidia-efa node group.
-
-[helm.tf](helm.tf) - helm charts to install [nvidia-device-plugin](https://github.com/NVIDIA/k8s-device-plugin) and [efa-device-plugin](https://github.com/aws-samples/aws-efa-eks)
+```terraform hl_lines="24-26 32-67"
+{% include "../../patterns/nvidia-gpu-efa/eks.tf" %}
+```

-[main.tf](main.tf) - main project template, includes [`eks.tf`](eks.tf) and [`helm.tf`](helm.tf)
+```terraform hl_lines="5-47"
+{% include "../../patterns/nvidia-gpu-efa/helm.tf" %}
+```

## Deploy

See [here](https://aws-ia.github.io/terraform-aws-eks-blueprints/getting-started/#prerequisites) for the prerequisites and steps to deploy this pattern.

## Validate

-Note:
+!!! note

    The desired instance type can be specified in [eks.tf](eks.tf#L36).
    The values shown below will change based on the instance type selected (e.g., `p5.48xlarge` has 8 GPUs and 32 EFA interfaces).
    A list of EFA-enabled instance types is available [here](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html#efa-instance-types).
    If you are using an on-demand capacity reservation (ODCR) for your instance type, uncomment the `capacity_reservation_specification` block in `eks.tf`
    and specify a `capacity_reservation_id`. Ensure that the region and availability zone of your ODCR match the ones used in `main.tf`.

-1. List the nodes with instance type details:
+1. List the nodes and their instance type:

```sh
kubectl get nodes -L node.kubernetes.io/instance-type
@@ -91,13 +91,7 @@ Note:
3. EFA info test

This test prints a list of available EFA interfaces by using the `/opt/amazon/efa/bin/fi_info` utility.
-Edit the script [generate-efa-info-test.sh](generate-efa-info-test.sh) and adjust the following environment variables if needed prior to running it:
-
-```sh
-export NUM_WORKERS=2
-export GPU_PER_WORKER=8
-export EFA_PER_WORKER=32
-```
+The script [generate-efa-info-test.sh](generate-efa-info-test.sh) creates an MPIJob manifest file named `efa-info-test.yaml`. It assumes that there are two cluster nodes with 8 GPUs and 32 EFA adapters per node. If you are not using `p5.48xlarge` instances in your cluster, adjust the settings in the script prior to running it.
`NUM_WORKERS` - number of nodes you want to run the test on
`GPU_PER_WORKER` - number of GPUs available on each node
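As a quick sanity check, the aggregate resources the generated MPIJob will request can be derived from these variables. A minimal sketch using the `p5.48xlarge` defaults from the script (the variable names match the script; the arithmetic is illustrative):

```shell
# Defaults assumed by generate-efa-info-test.sh for p5.48xlarge nodes
NUM_WORKERS=2       # number of worker nodes
GPU_PER_WORKER=8    # GPUs per node
EFA_PER_WORKER=32   # EFA interfaces per node

# Totals requested across the whole MPIJob
echo "total GPUs: $((NUM_WORKERS * GPU_PER_WORKER))"
echo "total EFA interfaces: $((NUM_WORKERS * EFA_PER_WORKER))"
```

With the defaults above this prints 16 total GPUs and 64 total EFA interfaces.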
@@ -107,15 +101,13 @@ Note:
./generate-efa-info-test.sh
```
-This script generates an MPIJob manifest file named `efa-info-test.yaml`.

To start the test, apply the generated manifest to the cluster:
```sh
kubectl apply -f ./efa-info-test.yaml
```
-```log
+```text
mpijob.kubeflow.org/efa-info-test created
```
@@ -180,9 +172,8 @@ Note:
4. EFA NCCL test
The EFA NCCL test is used to measure network bandwidth by running the `/opt/nccl-tests/build/all_reduce_perf` utility.
-Please edit the script below and modify the environment variables at the beginning of the script as needed.
-Then generate the MPIJob manifest.
+Create an MPIJob manifest by executing the script below:
```sh
./generate-efa-nccl-test.sh
@@ -205,7 +196,7 @@
kubectl logs -f $(kubectl get pods | grep launcher | cut -d ' ' -f 1)
```
-```log
+```text
...
[1,0]<stdout>:# out-of-place in-place
[1,0]<stdout>:# size count type redop root time algbw busbw #wrong time algbw busbw #wrong
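When reading the `algbw`/`busbw` columns in the output above: nccl-tests derives bus bandwidth from algorithmic bandwidth, and for all-reduce the documented conversion factor is 2(n-1)/n, where n is the number of ranks. A minimal sketch of the conversion (the 400 GB/s input is an illustrative placeholder, not a measured result):

```shell
# busbw = algbw * 2*(n-1)/n for all-reduce (nccl-tests convention)
busbw_all_reduce() {
  awk -v algbw="$1" -v n="$2" 'BEGIN { printf "%.1f\n", algbw * 2 * (n - 1) / n }'
}

# 16 ranks (two p5.48xlarge nodes, 8 GPUs each), hypothetical 400 GB/s algbw
busbw_all_reduce 400 16   # prints 750.0
```

This is why `busbw` approaches 2x `algbw` as the rank count grows.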
72 changes: 30 additions & 42 deletions patterns/nvidia-gpu-efa/generate-efa-nccl-test.sh
@@ -8,16 +8,18 @@ export GPU_PER_WORKER=8
export EFA_PER_WORKER=32
export TOTAL_GPUS=$((${NUM_WORKERS}*${GPU_PER_WORKER}))

-export NCCL_NVLS_ENABLE=1
-export NCCL_PROTO=LL,LL128,Simple
-export NCCL_ALGO=Ring
export FI_PROVIDER=efa
export FI_EFA_USE_DEVICE_RDMA=1
-export RDMAV_FORK_SAFE=1
-export NCCL_SHM_DISABLE=0
+export FI_EFA_FORK_SAFE=1

+export NCCL_DEBUG=WARN
+export NCCL_BUFFSIZE=8388608
+export NCCL_P2P_NET_CHUNKSIZE=524288

export HUGEPAGES_2MI=5120Mi
-export MEMORY=10000Mi
+export MEMORY=32000Mi

export DOLLAR='$'


cat <<EOF >> efa-nccl-test.yaml
@@ -30,7 +32,6 @@ spec:
cleanPodPolicy: Running
backoffLimit: 20
slotsPerWorker: ${GPU_PER_WORKER}
-mpiImplementation: "OpenMPI"
mpiReplicaSpecs:
Launcher:
replicas: 1
@@ -43,67 +44,54 @@ spec:
imagePullPolicy: Always
env:
- name: LD_LIBRARY_PATH
-value: "/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib"
+value: "/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:${DOLLAR}LD_LIBRARY_PATH"
- name: PATH
-value: "/opt/amazon/efa/bin:/usr/bin"
-- name: XLA_FLAGS
-value: "--xla_gpu_cuda_data_dir=/usr/local/cuda"
-- name: TF_XLA_FLAGS
-value: "--tf_xla_cpu_global_jit"
-- name: NCCL_DEBUG
-value: INFO
+value: "${DOLLAR}PATH:/opt/amazon/efa/bin:/usr/bin"
command:
- /opt/amazon/openmpi/bin/mpirun
- --allow-run-as-root
- --oversubscribe
- --tag-output
-#- -N
-#- "8"
+- -N
+- "${GPU_PER_WORKER}"
- -np
- "${TOTAL_GPUS}"
- -bind-to
- none
- -map-by
- slot
- -x
- PATH
- -x
- LD_LIBRARY_PATH
- -x
-- XLA_FLAGS
+- FI_PROVIDER=${FI_PROVIDER}
- -x
+- FI_EFA_USE_DEVICE_RDMA=${FI_EFA_USE_DEVICE_RDMA}
+- -x
+- FI_EFA_FORK_SAFE=${FI_EFA_FORK_SAFE}
+- -x
+- NCCL_DEBUG=${NCCL_DEBUG}
+- -x
-- TF_XLA_FLAGS
+- NCCL_BUFFSIZE=${NCCL_BUFFSIZE}
- -x
-- NCCL_DEBUG=INFO
-#- -x
-#- NCCL_NVLS_ENABLE=${NCCL_NVLS_ENABLE}
-#- -x
-#- NCCL_PROTO=${NCCL_PROTO}
-#- -x
-#- NCCL_ALGO=${NCCL_ALGO}
-#- -x
-#- FI_PROVIDER=${FI_PROVIDER}
-#- -x
-#- FI_EFA_USE_DEVICE_RDMA=${FI_EFA_USE_DEVICE_RDMA}
-#- -x
-#- RDMAV_FORK_SAFE=${RDMAV_FORK_SAFE}
-#- -x
-#- NCCL_SHM_DISABLE=${NCCL_SHM_DISABLE}
+- NCCL_P2P_NET_CHUNKSIZE=${NCCL_P2P_NET_CHUNKSIZE}
- --mca
- pml
-- ^cm
+- ^cm,ucx
- --mca
- btl
- tcp,self
- --mca
- btl_tcp_if_exclude
- lo,docker0,veth_def_agent
- --mca
- plm_rsh_agent
- ssh
- /opt/nccl-tests/build/all_reduce_perf
- -b
-- "1"
+- "8"
- -e
-- 2G
+- "16G"
- -f
- "2"
- -t
- "1"
- -g
- "1"
- -c
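A side note on the `${DOLLAR}` variable used throughout `generate-efa-nccl-test.sh`: the manifest is produced by an unquoted heredoc, so every `$` is expanded at generation time. Exporting `DOLLAR='$'` lets the script emit a literal `$`, deferring the expansion of expressions like `${DOLLAR}LD_LIBRARY_PATH` to the container runtime. A minimal sketch of the technique:

```shell
# An unquoted heredoc expands variables immediately; DOLLAR='$' smuggles
# a literal '$' through so expansion is deferred to the container runtime.
DOLLAR='$'
cat <<EOF
value: "/opt/amazon/efa/lib:${DOLLAR}LD_LIBRARY_PATH"
EOF
# prints: value: "/opt/amazon/efa/lib:$LD_LIBRARY_PATH"
```

The alternative, quoting the heredoc delimiter (`<<'EOF'`), would suppress expansion entirely, which is why the script uses the `DOLLAR` indirection instead.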
2 changes: 1 addition & 1 deletion patterns/nvidia-gpu-efa/main.tf
@@ -46,7 +46,7 @@ data "aws_availability_zones" "available" {}

locals {
name = basename(path.cwd)
-region = "us-east-2"
+region = "us-west-2"

vpc_cidr = "10.0.0.0/16"
azs = slice(data.aws_availability_zones.available.names, 0, 3)
