From 0e541bcd4072c50411ac3581ffcb7404b3c3e1a7 Mon Sep 17 00:00:00 2001 From: Alex Iankoulski Date: Mon, 22 Jul 2024 15:48:25 -0700 Subject: [PATCH] Modified PR to reflect feedback from reviewers --- patterns/nvidia-gpu-efa/README.md | 39 ++++------ .../nvidia-gpu-efa/generate-efa-nccl-test.sh | 72 ++++++++----------- patterns/nvidia-gpu-efa/main.tf | 2 +- 3 files changed, 46 insertions(+), 67 deletions(-) diff --git a/patterns/nvidia-gpu-efa/README.md b/patterns/nvidia-gpu-efa/README.md index 6315eeee8d..ec38555674 100644 --- a/patterns/nvidia-gpu-efa/README.md +++ b/patterns/nvidia-gpu-efa/README.md @@ -1,6 +1,6 @@ -# EKS Cluster w/ NVIDIA GPUs and EFA for AI/ML Workloads +# EKS Cluster w/ NVIDIA GPUs and EFA for Machine Learning -This pattern demonstrates an Amazon EKS Cluster with an EFA-enabled nodegroup that utilizes `p5.48xlarge` instances with H100 NVIDIA GPUs used in distributed, multi-node AI/ML workloads. +This pattern demonstrates an Amazon EKS Cluster with an EFA-enabled nodegroup that utilizes `p5.48xlarge` instances with H100 NVIDIA GPUs used in distributed, multi-node machine learning. The following components are demonstrated in this pattern: @@ -17,13 +17,13 @@ The following components are demonstrated in this pattern: ## Code -The code consists of the following files: - -[eks.tf](eks.tf) - template of the EKS cluster, containing a default node group and nvidia-efa node group. 
- -[helm.tf](helm.tf) - helm charts to install [nvidia-device-plugin](https://github.com/NVIDIA/k8s-device-plugin) and [efa-device-plugin](https://github.com/aws-samples/aws-efa-eks) +```terraform hl_lines="24-26 32-67" +{% include "../../patterns/nvidia-gpu-efa/eks.tf" %} +``` -[main.tf](main.tf) - main project template, includes [`eks.tf`](eks.tf) and [`helm.tf`](helm.tf) +```terraform hl_lines="5-47" +{% include "../../patterns/nvidia-gpu-efa/helm.tf" %} +``` ## Deploy @@ -31,7 +31,7 @@ See [here](https://aws-ia.github.io/terraform-aws-eks-blueprints/getting-started ## Validate -Note: +!!! note Desired instance type can be specified in [eks.tf](eks.tf#L36). Values shown below will change based on the instance type selected (i.e. - `p5.48xlarge` has 8 GPUs and 32 EFA interfaces). @@ -39,7 +39,7 @@ Note: If you are using an on-demand capacity reservation (ODCR) for your instance type, please uncomment the `capacity_reservation_specification` block in `eks.tf` and specify a capacity_reservation_id. Please ensure that the region and availability zone of your ODCR match the ones used in `main.tf`. -1. List the nodes with instance type details: +1. List the nodes and their instance type: ```sh kubectl get nodes -L node.kubernetes.io/instance-type @@ -91,13 +91,7 @@ Note: 3. EFA info test This test prints a list of available EFA interfaces by using the `/opt/amazon/efa/bin/fi_info` utility. - Edit the script [generate-efa-info-test.sh](generate-efa-info-test.sh) and adjust the following environment variables if needed prior to running it: - - ```sh - export NUM_WORKERS=2 - export GPU_PER_WORKER=8 - export EFA_PER_WORKER=32 - ``` + The script [generate-efa-info-test.sh](generate-efa-info-test.sh) creates an MPIJob manifest file named `efa-info-test.yaml`. It assumes that there are two cluster nodes with 8 GPUs and 32 EFA adapters per node. If you are not using `p5.48xlarge` instances in your cluster, you may adjust the settings in the script prior to running it. 
`NUM_WORKERS` - number of nodes you want to run the test on `GPU_PER_WORKER` - number of GPUs available on each node @@ -107,15 +101,13 @@ Note: ./generate-efa-info-test.sh ``` - This script generates an MPIJob manifest file named `efa-info-test.yaml`. - To start the test apply the generated manifest to the cluster: ```sh kubectl apply -f ./efa-info-test.yaml ``` - ```log + ```text mpijob.kubeflow.org/efa-info-test created ``` @@ -180,9 +172,8 @@ Note: 4. EFA NCCL test - The EFA NCCL test is used to measure network bandwidth by running the `/opt/nccl-tests/build/all_reduce_perf` utility. - Please edit the script below and modify the environment variables at the beginning of the script as needed. - Then generate the MPIjob manifest. + The EFA NCCL test is used to measure network bandwidth by running the `/opt/nccl-tests/build/all_reduce_perf` utility. + Create an MPIJob manifest by executing the script below: ```sh ./generate-efa-nccl-test.sh @@ -205,7 +196,7 @@ Note: kubectl logs -f $(kubectl get pods | grep launcher | cut -d ' ' -f 1) ``` - ```log + ```text ... 
[1,0]:# out-of-place in-place [1,0]:# size count type redop root time algbw busbw #wrong time algbw busbw #wrong diff --git a/patterns/nvidia-gpu-efa/generate-efa-nccl-test.sh b/patterns/nvidia-gpu-efa/generate-efa-nccl-test.sh index f16327ac60..5a376f0799 100755 --- a/patterns/nvidia-gpu-efa/generate-efa-nccl-test.sh +++ b/patterns/nvidia-gpu-efa/generate-efa-nccl-test.sh @@ -8,16 +8,18 @@ export GPU_PER_WORKER=8 export EFA_PER_WORKER=32 export TOTAL_GPUS=$((${NUM_WORKERS}*${GPU_PER_WORKER})) -export NCCL_NVLS_ENABLE=1 -export NCCL_PROTO=LL,LL128,Simple -export NCCL_ALGO=Ring export FI_PROVIDER=efa export FI_EFA_USE_DEVICE_RDMA=1 -export RDMAV_FORK_SAFE=1 -export NCCL_SHM_DISABLE=0 +export FI_EFA_FORK_SAFE=1 + +export NCCL_DEBUG=WARN +export NCCL_BUFFSIZE=8388608 +export NCCL_P2P_NET_CHUNKSIZE=524288 export HUGEPAGES_2MI=5120Mi -export MEMORY=10000Mi +export MEMORY=32000Mi + +export DOLLAR='$' cat <<EOF >> efa-nccl-test.yaml @@ -30,7 +32,6 @@ spec: cleanPodPolicy: Running backoffLimit: 20 slotsPerWorker: ${GPU_PER_WORKER} - mpiImplementation: "OpenMPI" mpiReplicaSpecs: Launcher: replicas: 1 @@ -43,67 +44,54 @@ spec: imagePullPolicy: Always env: - name: LD_LIBRARY_PATH - value: "/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib" + value: "/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:${DOLLAR}LD_LIBRARY_PATH" - name: PATH - value: "/opt/amazon/efa/bin:/usr/bin" - - name: XLA_FLAGS - value: "--xla_gpu_cuda_data_dir=/usr/local/cuda" - - name: TF_XLA_FLAGS - value: "--tf_xla_cpu_global_jit" - - name: NCCL_DEBUG - value: INFO + value: "${DOLLAR}PATH:/opt/amazon/efa/bin:/usr/bin" command: - /opt/amazon/openmpi/bin/mpirun - --allow-run-as-root - - --oversubscribe - --tag-output - #- -N - #- "8" + - -N + - "${GPU_PER_WORKER}" - -np - "${TOTAL_GPUS}" - -bind-to - none - - -map-by - - slot - -x - PATH - -x - LD_LIBRARY_PATH - -x - - XLA_FLAGS + - 
FI_PROVIDER=${FI_PROVIDER} + - -x + - FI_EFA_USE_DEVICE_RDMA=${FI_EFA_USE_DEVICE_RDMA} + - -x + - FI_EFA_FORK_SAFE=${FI_EFA_FORK_SAFE} + - -x + - NCCL_DEBUG=${NCCL_DEBUG} - -x - - TF_XLA_FLAGS + - NCCL_BUFFSIZE=${NCCL_BUFFSIZE} - -x - - NCCL_DEBUG=INFO - #- -x - #- NCCL_NVLS_ENABLE=${NCCL_NVLS_ENABLE} - #- -x - #- NCCL_PROTO=${NCCL_PROTO} - #- -x - #- NCCL_ALGO=${NCCL_ALGO} - #- -x - #- FI_PROVIDER=${FI_PROVIDER} - #- -x - #- FI_EFA_USE_DEVICE_RDMA=${FI_EFA_USE_DEVICE_RDMA} - #- -x - #- RDMAV_FORK_SAFE=${RDMAV_FORK_SAFE} - #- -x - #- NCCL_SHM_DISABLE=${NCCL_SHM_DISABLE} + - NCCL_P2P_NET_CHUNKSIZE=${NCCL_P2P_NET_CHUNKSIZE} - --mca - pml - - ^cm + - ^cm,ucx + - --mca + - btl + - tcp,self + - --mca + - btl_tcp_if_exclude + - lo,docker0,veth_def_agent - --mca - plm_rsh_agent - ssh - /opt/nccl-tests/build/all_reduce_perf - -b - - "1" + - "8" - -e - - 2G + - "16G" - -f - "2" - - -t - - "1" - -g - "1" - -c diff --git a/patterns/nvidia-gpu-efa/main.tf b/patterns/nvidia-gpu-efa/main.tf index c67ee3eac9..03b3fced85 100644 --- a/patterns/nvidia-gpu-efa/main.tf +++ b/patterns/nvidia-gpu-efa/main.tf @@ -46,7 +46,7 @@ data "aws_availability_zones" "available" {} locals { name = basename(path.cwd) - region = "us-east-2" + region = "us-west-2" vpc_cidr = "10.0.0.0/16" azs = slice(data.aws_availability_zones.available.names, 0, 3)