From d35ac91a99674a39805249491dcddc461d6e67fe Mon Sep 17 00:00:00 2001
From: Alex Iankoulski
Date: Fri, 21 Jun 2024 06:11:47 +0000
Subject: [PATCH] Update nvidia-gpu-efa template

---
 patterns/nvidia-gpu-efa/.gitignore            |   2 +
 patterns/nvidia-gpu-efa/README.md             | 198 +++++++++++++-----
 patterns/nvidia-gpu-efa/eks.tf                |   9 +
 .../nvidia-gpu-efa/generate-efa-info-test.sh  |  93 ++++++++
 .../nvidia-gpu-efa/generate-efa-nccl-test.sh  | 141 +++++++++++++
 patterns/nvidia-gpu-efa/main.tf               |   2 +-
 6 files changed, 395 insertions(+), 50 deletions(-)
 create mode 100644 patterns/nvidia-gpu-efa/.gitignore
 create mode 100755 patterns/nvidia-gpu-efa/generate-efa-info-test.sh
 create mode 100755 patterns/nvidia-gpu-efa/generate-efa-nccl-test.sh

diff --git a/patterns/nvidia-gpu-efa/.gitignore b/patterns/nvidia-gpu-efa/.gitignore
new file mode 100644
index 0000000000..4ae0543912
--- /dev/null
+++ b/patterns/nvidia-gpu-efa/.gitignore
@@ -0,0 +1,2 @@
+efa-info-test.yaml
+efa-nccl-test.yaml
diff --git a/patterns/nvidia-gpu-efa/README.md b/patterns/nvidia-gpu-efa/README.md
index b352a19863..ec9a4f5617 100644
--- a/patterns/nvidia-gpu-efa/README.md
+++ b/patterns/nvidia-gpu-efa/README.md
@@ -1,6 +1,6 @@
-# EKS Cluster w/ NVIDIA GPUs and EFA for Machine Learning
+# EKS Cluster w/ NVIDIA GPUs and EFA for AI/ML Workloads
 
-This pattern demonstrates an Amazon EKS Cluster with an EFA-enabled nodegroup that utilizes `p5.48xlarge` instances with H100 NVIDIA GPUs used in distributed, multi-node machine learning workloads.
+This pattern demonstrates an Amazon EKS Cluster with an EFA-enabled nodegroup that utilizes `p5.48xlarge` instances with H100 NVIDIA GPUs used in distributed, multi-node AI/ML workloads.
 
 The following components are demonstrated in this pattern:
 
@@ -31,24 +31,30 @@ See [here](https://aws-ia.github.io/terraform-aws-eks-blueprints/getting-started
 
 ## Validate
 
-!!! note
+Note:
 
-    The following steps are shown with `g5.8xlarge` for frugality. Values shown below will change based on the instance type selected (i.e. - `p5.48xlarge` has 8 GPUs and 32 EFA interfaces)
+    The desired instance type can be specified in [eks.tf](eks.tf#L36).
+    Values shown below will change based on the instance type selected (e.g. `p5.48xlarge` has 8 GPUs and 32 EFA interfaces).
+    A list of EFA-enabled instance types is available [here](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html#efa-instance-types).
+    If you are using an on-demand capacity reservation (ODCR) for your instance type, please uncomment the `capacity_reservation_specification` block in `eks.tf`
+    and specify a `capacity_reservation_id`. Please ensure that the region and availability zone of your ODCR match the ones used in `main.tf`.
 
-1. List the nodes by instance type:
+1. 
List the nodes with instance type details:
 
    ```sh
-   kubectl get nodes -o yaml | grep instance-type | grep node | grep -v f:
+   kubectl get nodes -L node.kubernetes.io/instance-type
    ```
 
    ```text
-   node.kubernetes.io/instance-type: g5.8xlarge
-   node.kubernetes.io/instance-type: m5.large
-   node.kubernetes.io/instance-type: m5.large
-   node.kubernetes.io/instance-type: g5.8xlarge
+   NAME                                        STATUS   ROLES    AGE   VERSION               INSTANCE-TYPE
+   ip-10-0-1-16.us-east-2.compute.internal     Ready    <none>   12h   v1.29.3-eks-ae9a62a   p5.48xlarge
+   ip-10-0-12-113.us-east-2.compute.internal   Ready    <none>   14h   v1.29.3-eks-ae9a62a   m5.large
+   ip-10-0-12-201.us-east-2.compute.internal   Ready    <none>   12h   v1.29.3-eks-ae9a62a   p5.48xlarge
+   ip-10-0-46-217.us-east-2.compute.internal   Ready    <none>   14h   v1.29.3-eks-ae9a62a   m5.large
+   ```
 
-   You should see two EFA-enabled (in this example `g5.8xlarge`) nodes in the list.
+   You should see two EFA-enabled (in this example `p5.48xlarge`) nodes in the list.
 
 2. Deploy Kubeflow MPI Operator
 
@@ -56,7 +62,7 @@ See [here](https://aws-ia.github.io/terraform-aws-eks-blueprints/getting-started
    To deploy the MPI operator execute the following:
 
    ```sh
-   kubectl apply -f https://raw.githubusercontent.com/kubeflow/mpi-operator/v0.3.0/deploy/v2beta1/mpi-operator.yaml
+   kubectl apply -f https://raw.githubusercontent.com/kubeflow/mpi-operator/v0.4.0/deploy/v2beta1/mpi-operator.yaml
    ```
 
    ```text
@@ -82,80 +88,174 @@ See [here](https://aws-ia.github.io/terraform-aws-eks-blueprints/getting-started
    clusterrole.rbac.authorization.k8s.io/mpi-operator configured
    ```
 
-3. EFA test
+3. EFA info test
 
-   The results should shown that two EFA adapters are available (one for each worker pod)
+   This test prints a list of available EFA interfaces by using the `/opt/amazon/efa/bin/fi_info` utility.
+   Edit the script [generate-efa-info-test.sh](generate-efa-info-test.sh) and adjust the following environment variables if needed prior to running it:
+
+   ```sh
+   export NUM_WORKERS=2
+   export GPU_PER_WORKER=8
+   export EFA_PER_WORKER=32
+   ```
+
+   - `NUM_WORKERS` - number of nodes you want to run the test on
+   - `GPU_PER_WORKER` - number of GPUs available on each node
+   - `EFA_PER_WORKER` - number of EFA interfaces available on each node
+
+   ```sh
+   ./generate-efa-info-test.sh
+   ```
+
+   This script generates an MPIJob manifest file named `efa-info-test.yaml`.
+
+   To start the test, apply the generated manifest to the cluster:
 
    ```sh
-   kubectl apply -f https://raw.githubusercontent.com/aws-samples/aws-do-eks/main/Container-Root/eks/deployment/efa-device-plugin/test-efa.yaml
+   kubectl apply -f ./efa-info-test.yaml
    ```
 
-   ```text
+   ```log
    mpijob.kubeflow.org/efa-info-test created
+   ```
+
+   Observe the pods in the current namespace. You should see a launcher pod and worker pods.
+   It is normal for the launcher pod to restart a few times until the worker pods are fully running.
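+   The launcher runs `mpirun`, which connects to the worker pods over SSH (note the `plm_rsh_agent`
+   setting in the generated manifest), so it typically keeps restarting until all workers are up and resolvable by name.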
+
+   ```sh
+   watch kubectl get pods
+   ```
+
+   ```log
+   NAME                           READY   STATUS             RESTARTS      AGE
+   efa-info-test-launcher-wm8pm   0/1     CrashLoopBackOff   1 (16s ago)   19s
+   efa-info-test-worker-0         1/1     Running            0             19s
+   efa-info-test-worker-1         1/1     Running            0             19s
+   ```
+
+   ```log
+   NAME                           READY   STATUS    RESTARTS      AGE
+   efa-info-test-launcher-wm8pm   1/1     Running   2 (18s ago)   21s
+   efa-info-test-worker-0         1/1     Running   0             21s
+   efa-info-test-worker-1         1/1     Running   0             21s
+   ```
+
+   ```log
+   NAME                           READY   STATUS      RESTARTS   AGE
+   efa-info-test-launcher-wm8pm   0/1     Completed   2          5m20s
+   ```
+
+   Once the test launcher pod enters status `Running` or `Completed`,
+   see the test logs using the command below:
 
    ```sh
    kubectl logs -f $(kubectl get pods | grep launcher | cut -d ' ' -f 1)
    ```
 
-   ```text
-   Warning: Permanently added 'efa-info-test-worker-1.efa-info-test-worker.default.svc,10.11.13.224' (ECDSA) to the list of known hosts.
-   Warning: Permanently added 'efa-info-test-worker-0.efa-info-test-worker.default.svc,10.11.4.63' (ECDSA) to the list of known hosts.
+   ```log
+   Warning: Permanently added 'efa-info-test-worker-1.efa-info-test.default.svc' (ED25519) to the list of known hosts.
+   Warning: Permanently added 'efa-info-test-worker-0.efa-info-test.default.svc' (ED25519) to the list of known hosts.
    [1,1]:provider: efa
    [1,1]:    fabric: efa
-   [1,1]:    domain: rdmap197s0-rdm
-   [1,1]:    version: 116.10
+   [1,1]:    domain: rdmap79s0-rdm
+   [1,1]:    version: 120.10
    [1,1]:    type: FI_EP_RDM
    [1,1]:    protocol: FI_PROTO_EFA
+
+   ...
+
    [1,0]:provider: efa
    [1,0]:    fabric: efa
-   [1,0]:    domain: rdmap197s0-rdm
-   [1,0]:    version: 116.10
+   [1,0]:    domain: rdmap201s0-rdm
+   [1,0]:    version: 120.10
    [1,0]:    type: FI_EP_RDM
    [1,0]:    protocol: FI_PROTO_EFA
    ```
 
-4. EFA NCCL test
+   Finally, remove the job:
 
-   To run the EFA NCCL test please execute the following kubectl command:
+   ```sh
+   kubectl delete -f ./efa-info-test.yaml
+   ```
+
+4. EFA NCCL test
+
+   The EFA NCCL test is used to measure network bandwidth by running the `/opt/nccl-tests/build/all_reduce_perf` utility.
+   Edit the script [generate-efa-nccl-test.sh](generate-efa-nccl-test.sh) and modify the environment variables at the beginning of the script as needed.
+   Then generate the MPIJob manifest:
 
    ```sh
-   kubectl apply -f https://raw.githubusercontent.com/aws-samples/aws-do-eks/main/Container-Root/eks/deployment/efa-device-plugin/test-nccl-efa.yaml
+   ./generate-efa-nccl-test.sh
    ```
 
-   ```text
-   mpijob.kubeflow.org/test-nccl-efa created
-   ```
+   This script creates a file named `efa-nccl-test.yaml`. Apply the manifest to start the EFA NCCL test:
 
-   Once the launcher pod enters `Running` or `Completed` state, execute the following to see the test logs:
+   ```sh
+   kubectl apply -f ./efa-nccl-test.yaml
+   ```
+
+   ```text
+   mpijob.kubeflow.org/efa-nccl-test created
+   ```
 
+   As with the EFA info test, a launcher pod and worker pods will be created. The launcher pod will remain
+   in `CrashLoopBackOff` until the worker pods enter the `Running` state.
+   Once the launcher pod enters the `Running` state as well, execute the following command to see the test logs:
+
    ```sh
    kubectl logs -f $(kubectl get pods | grep launcher | cut -d ' ' -f 1)
    ```
 
-   ```text
-   [1,0]:test-nccl-efa-worker-0:21:21 [0] NCCL INFO NET/OFI Selected Provider is efa (found 1 nics)
-   [1,0]:test-nccl-efa-worker-0:21:21 [0] NCCL INFO Using network AWS Libfabric
-   [1,0]:NCCL version 2.12.7+cuda11.4
+   ```log
+   ...
+   [1,0]:#                                                              out-of-place                       in-place
+   [1,0]:#       size         count      type   redop    root     time   algbw   busbw #wrong     time   algbw   busbw #wrong
+   [1,0]:#        (B)    (elements)                               (us)  (GB/s)  (GB/s)            (us)  (GB/s)  (GB/s)
+   [1,0]:           0             0     float     sum      -1     0.13    0.00    0.00      0     0.12    0.00    0.00      0
+   [1,0]:           0             0     float     sum      -1     0.12    0.00    0.00      0     0.12    0.00    0.00      0
+   [1,0]:           4             1     float     sum      -1    65.43    0.00    0.00      0    65.82    0.00    0.00      0
+   [1,0]:           8             2     float     sum      -1    64.86    0.00    0.00      0    65.67    0.00    0.00      0
+   [1,0]:          16             4     float     sum      -1    64.72    0.00    0.00      0    64.83    0.00    0.00      0
+   [1,0]:          32             8     float     sum      -1    65.47    0.00    0.00      0    65.16    0.00    0.00      0
+   [1,0]:          64            16     float     sum      -1    65.34    0.00    0.00      0    65.58    0.00    0.00      0
+   [1,0]:         128            32     float     sum      -1    65.99    0.00    0.00      0    66.28    0.00    0.00      0
+   [1,0]:         256            64     float     sum      -1    75.81    0.00    0.01      0    66.76    0.00    0.01      0
+   [1,0]:         512           128     float     sum      -1    69.43    0.01    0.01      0    67.18    0.01    0.01      0
+   [1,0]:        1024           256     float     sum      -1    82.35    0.01    0.02      0    69.03    0.01    0.03      0
+   [1,0]:        2048           512     float     sum      -1    72.49    0.03    0.05      0    71.37    0.03    0.05      0
+   [1,0]:        4096          1024     float     sum      -1    77.47    0.05    0.10      0    77.42    0.05    0.10      0
+   [1,0]:        8192          2048     float     sum      -1    78.10    0.10    0.20      0    78.01    0.11    0.20      0
+   [1,0]:       16384          4096     float     sum      -1    93.35    0.18    0.33      0    80.11    0.20    0.38      0
+   [1,0]:       32768          8192     float     sum      -1    106.6    0.31    0.58      0    96.22    0.34    0.64      0
+   [1,0]:       65536         16384     float     sum      -1    120.6    0.54    1.02      0    89.06    0.74    1.38      0
+   [1,0]:      131072         32768     float     sum      -1    93.62    1.40    2.62      0    106.3    1.23    2.31      0
+   [1,0]:      262144         65536     float     sum      -1    111.5    2.35    4.41      0    111.6    2.35    4.41      0
+   [1,0]:      524288        131072     float     sum      -1    121.2    4.33    8.11      0    109.9    4.77    8.94      0
+   [1,0]:     1048576        262144     float     sum      -1    119.7    8.76   16.43      0    118.7    8.83   16.56      0
+   [1,0]:     2097152        524288     float     sum      -1    143.9   14.58   27.33      0    144.2   14.55   27.28      0
+   [1,0]:     4194304       1048576     float     sum      -1    163.7   25.62   48.03      0    163.6   25.64   48.08      0
+   [1,0]:     8388608       2097152     float     sum      -1    195.3   42.95   80.54      0    194.9   43.03   80.69      0
+   [1,0]:    16777216       4194304     float     sum      -1    278.6   60.22  112.91      0    279.9   59.94  112.38      0
+   [1,0]:    33554432       8388608     float     sum      -1    459.7   73.00  136.87      0    433.9   77.34  145.01      0
+   [1,0]:    67108864      16777216     float     sum      -1    587.2  114.29  214.29      0    587.1  114.31  214.34      0
+   [1,0]:   134217728      33554432     float     sum      -1    926.6  144.85  271.60      0    851.5  157.63  295.55      0
+   [1,0]:   268435456      67108864     float     sum      -1   1497.8  179.22  336.03      0   1496.0  179.44  336.45      0
+   [1,0]:   536870912     134217728     float     sum      -1   2558.6  209.83  393.42      0   2560.8  209.65  393.10      0
+   [1,0]:  1073741824     268435456     float     sum      -1   4553.6  235.80  442.13      0   4553.0  235.83  442.19      0
+   [1,0]:  2147483648     536870912     float     sum      -1   9062.5  236.96  444.31      0   9060.4  237.02  444.41      0
+   [1,0]:# Out of bounds values : 0 OK
+   [1,0]:# Avg bus bandwidth    : 79.9352
+   [1,0]:#
    ```
 
-   Columns 8 and 12 in the output table show the in-place and out-of-place bus bandwidth calculated for the data size listed in column 1. In this case it is 3.13 and 3.12 GB/s respectively.
-   Your actual results may be slightly different. The calculated average bus bandwidth is displayed at the bottom of the log when the test finishes after it reaches the max data size,
-   specified in the mpijob manifest. In this result the average bus bandwidth is 1.15 GB/s.
+   Columns 9 and 13 in the output table (counting the leading `[1,0]:` rank tag as column 1) show the out-of-place and in-place bus bandwidth calculated for the data size listed in column 2.
+   In this case they reach a maximum of 444.31 and 444.41 GB/s respectively.
+   Your actual results may be slightly different. The calculated average bus bandwidth is displayed at the end of the log.
+   In this test run the average bus bandwidth was 79.9352 GB/s.
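+
+   As a sanity check, the bus bandwidth can be recomputed from the algorithm bandwidth:
+   per the nccl-tests documentation, `all_reduce` bus bandwidth scales algbw by `2*(n-1)/n`,
+   where `n` is the total number of ranks (here 2 workers x 8 GPUs = 16).
+   For example, a quick check with `bc` against the in-place algbw from the last row of the table above:
+
+   ```sh
+   # busbw = algbw * 2*(n-1)/n; 237.02 GB/s * 2*15/16 ~ 444.41 GB/s, matching the busbw column
+   echo '237.02 * 2 * (16 - 1) / 16' | bc -l
+   ```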
-   ```text
-   [1,0]:#      size         count    type   redop    root     time   algbw   busbw  #wrong     time   algbw   busbw  #wrong
-   [1,0]:#       (B)    (elements)                             (us)  (GB/s)  (GB/s)             (us)  (GB/s)  (GB/s)
-   ...
-   [1,0]:    262144         65536   float     sum      -1    195.0    1.34    1.34      0    194.0    1.35    1.35      0
-   [1,0]:    524288        131072   float     sum      -1    296.9    1.77    1.77      0    291.1    1.80    1.80      0
-   [1,0]:   1048576        262144   float     sum      -1    583.4    1.80    1.80      0    579.6    1.81    1.81      0
-   [1,0]:   2097152        524288   float     sum      -1    983.3    2.13    2.13      0    973.9    2.15    2.15      0
-   [1,0]:   4194304       1048576   float     sum      -1   1745.4    2.40    2.40      0   1673.2    2.51    2.51      0
-   ...
-   [1,0]:# Avg bus bandwidth    : 1.15327
+
+   Lastly, delete the MPIJob:
+
+   ```sh
+   kubectl delete -f ./efa-nccl-test.yaml
    ```
 
 ## Destroy
diff --git a/patterns/nvidia-gpu-efa/eks.tf b/patterns/nvidia-gpu-efa/eks.tf
index 9193ce6bb3..9b2a89edf0 100644
--- a/patterns/nvidia-gpu-efa/eks.tf
+++ b/patterns/nvidia-gpu-efa/eks.tf
@@ -35,6 +35,15 @@ module "eks" {
 
       ami_type       = "AL2_x86_64_GPU"
       instance_types = ["p5.48xlarge"]
+
+      # Uncomment this block and specify capacity_reservation_id
+      # to use an on-demand capacity reservation (ODCR) for your EFA instances
+      #capacity_reservation_specification = {
+      #  capacity_reservation_target = {
+      #    capacity_reservation_id = "cr-xxxxxxxxxxxxxxxxx"
+      #  }
+      #}
+
       pre_bootstrap_user_data = <<-EOT
         # Mount instance store volumes in RAID-0 for kubelet and containerd
         # https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
diff --git a/patterns/nvidia-gpu-efa/generate-efa-info-test.sh b/patterns/nvidia-gpu-efa/generate-efa-info-test.sh
new file mode 100755
index 0000000000..646d9724db
--- /dev/null
+++ b/patterns/nvidia-gpu-efa/generate-efa-info-test.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+
+export MPI_JOB_NAME=efa-info-test
+export IMAGE_URI=public.ecr.aws/hpc-cloud/nccl-tests:latest
+export NUM_WORKERS=2
+export GPU_PER_WORKER=8
+export EFA_PER_WORKER=32
+export TOTAL_GPUS=$((${NUM_WORKERS}*${GPU_PER_WORKER}))
+
+cat <<EOF >> efa-info-test.yaml
+apiVersion: kubeflow.org/v2beta1
+kind: MPIJob
+metadata:
+  name: ${MPI_JOB_NAME}
+spec:
+  runPolicy:
+    cleanPodPolicy: Running
+    backoffLimit: 20
+  slotsPerWorker: ${GPU_PER_WORKER}
+  mpiReplicaSpecs:
+    Launcher:
+      replicas: 1
+      template:
+        spec:
+          restartPolicy: OnFailure
+          tolerations:
+          - key: "nvidia.com/gpu"
+            operator: "Equal"
+            value: "true"
+            effect: "NoSchedule"
+          containers:
+          - image: ${IMAGE_URI}
+            name: ${MPI_JOB_NAME}-launcher
+            imagePullPolicy: IfNotPresent
+            env:
+            - name: LD_LIBRARY_PATH
+              value: "/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib"
+            - name: PATH
+              value: "/opt/amazon/efa/bin:/usr/bin"
+            - name: XLA_FLAGS
+              value: "--xla_gpu_cuda_data_dir=/usr/local/cuda"
+            - name: TF_XLA_FLAGS
+              value: "--tf_xla_cpu_global_jit"
+            - name: NCCL_DEBUG
+              value: INFO
+            command:
+            - /opt/amazon/openmpi/bin/mpirun
+            - --allow-run-as-root
+            - --tag-output
+            - -np
+            - "${TOTAL_GPUS}"
+            - -bind-to
+            - none
+            - -map-by
+            - slot
+            - -x
+            - PATH
+            - -x
+            - LD_LIBRARY_PATH
+            - -x
+            - XLA_FLAGS
+            - -x
+            - TF_XLA_FLAGS
+            - -x
+            - NCCL_DEBUG=INFO
+            - --mca
+            - pml
+            - ^cm
+            # mpirun's --mca takes a parameter name and a value as two separate
+            # arguments; plm_rsh_agent selects ssh as the process launch agent
+            - --mca
+            - plm_rsh_agent
+            - ssh
+            - --oversubscribe
+            - /opt/amazon/efa/bin/fi_info
+            - -p
+            - "efa"
+            - -t
+            - "FI_EP_RDM"
+    Worker:
+      replicas: ${NUM_WORKERS}
+      template:
+        spec:
+          containers:
+          - image: ${IMAGE_URI}
+            name: ${MPI_JOB_NAME}-worker
+            imagePullPolicy: IfNotPresent
+            resources:
+              limits:
+                nvidia.com/gpu: ${GPU_PER_WORKER}
+                vpc.amazonaws.com/efa: ${EFA_PER_WORKER}
+              requests:
+                nvidia.com/gpu: 
${GPU_PER_WORKER}
+                vpc.amazonaws.com/efa: ${EFA_PER_WORKER}
+EOF
+
diff --git a/patterns/nvidia-gpu-efa/generate-efa-nccl-test.sh b/patterns/nvidia-gpu-efa/generate-efa-nccl-test.sh
new file mode 100755
index 0000000000..f16327ac60
--- /dev/null
+++ b/patterns/nvidia-gpu-efa/generate-efa-nccl-test.sh
@@ -0,0 +1,141 @@
+#!/bin/bash
+
+export MPI_JOB_NAME=efa-nccl-test
+export IMAGE_URI=public.ecr.aws/hpc-cloud/nccl-tests:latest
+export INSTANCE_TYPE=p5.48xlarge
+export NUM_WORKERS=2
+export GPU_PER_WORKER=8
+export EFA_PER_WORKER=32
+export TOTAL_GPUS=$((${NUM_WORKERS}*${GPU_PER_WORKER}))
+
+export NCCL_NVLS_ENABLE=1
+export NCCL_PROTO=LL,LL128,Simple
+export NCCL_ALGO=Ring
+export FI_PROVIDER=efa
+export FI_EFA_USE_DEVICE_RDMA=1
+export RDMAV_FORK_SAFE=1
+export NCCL_SHM_DISABLE=0
+
+export HUGEPAGES_2MI=5120Mi
+export MEMORY=10000Mi
+
+
+cat <<EOF >> efa-nccl-test.yaml
+apiVersion: kubeflow.org/v2beta1
+kind: MPIJob
+metadata:
+  name: ${MPI_JOB_NAME}
+spec:
+  runPolicy:
+    cleanPodPolicy: Running
+    backoffLimit: 20
+  slotsPerWorker: ${GPU_PER_WORKER}
+  mpiImplementation: "OpenMPI"
+  mpiReplicaSpecs:
+    Launcher:
+      replicas: 1
+      template:
+        spec:
+          restartPolicy: OnFailure
+          containers:
+          - image: ${IMAGE_URI}
+            name: ${MPI_JOB_NAME}-launcher
+            imagePullPolicy: Always
+            env:
+            - name: LD_LIBRARY_PATH
+              value: "/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib"
+            - name: PATH
+              value: "/opt/amazon/efa/bin:/usr/bin"
+            - name: XLA_FLAGS
+              value: "--xla_gpu_cuda_data_dir=/usr/local/cuda"
+            - name: TF_XLA_FLAGS
+              value: "--tf_xla_cpu_global_jit"
+            - name: NCCL_DEBUG
+              value: INFO
+            command:
+            - /opt/amazon/openmpi/bin/mpirun
+            - --allow-run-as-root
+            - --oversubscribe
+            - --tag-output
+            #- -N
+            #- "8"
+            - -np
+            - "${TOTAL_GPUS}"
+            - -bind-to
+            - none
+            - -map-by
+            - slot
+            - -x
+            - PATH
+            - -x
+            - LD_LIBRARY_PATH
+            - -x
+            - XLA_FLAGS
+            - -x
+            - TF_XLA_FLAGS
+            - -x
+            - NCCL_DEBUG=INFO
+            #- -x
+            #- NCCL_NVLS_ENABLE=${NCCL_NVLS_ENABLE}
+            #- -x
+            #- NCCL_PROTO=${NCCL_PROTO}
+            #- -x
+            #- NCCL_ALGO=${NCCL_ALGO}
+            #- -x
+            #- FI_PROVIDER=${FI_PROVIDER}
+            #- -x
+            #- FI_EFA_USE_DEVICE_RDMA=${FI_EFA_USE_DEVICE_RDMA}
+            #- -x
+            #- RDMAV_FORK_SAFE=${RDMAV_FORK_SAFE}
+            #- -x
+            #- NCCL_SHM_DISABLE=${NCCL_SHM_DISABLE}
+            - --mca
+            - pml
+            - ^cm
+            - --mca
+            - plm_rsh_agent
+            - ssh
+            - /opt/nccl-tests/build/all_reduce_perf
+            - -b
+            - "1"
+            - -e
+            - 2G
+            - -f
+            - "2"
+            - -t
+            - "1"
+            - -g
+            - "1"
+            - -c
+            - "1"
+            - -n
+            - "100"
+    Worker:
+      replicas: ${NUM_WORKERS}
+      template:
+        spec:
+          nodeSelector:
+            node.kubernetes.io/instance-type: "${INSTANCE_TYPE}"
+          containers:
+          - image: ${IMAGE_URI}
+            name: ${MPI_JOB_NAME}-worker
+            imagePullPolicy: Always
+            volumeMounts:
+            - name: shmem
+              mountPath: /dev/shm
+            resources:
+              limits:
+                nvidia.com/gpu: ${GPU_PER_WORKER}
+                hugepages-2Mi: ${HUGEPAGES_2MI}
+                vpc.amazonaws.com/efa: ${EFA_PER_WORKER}
+                memory: ${MEMORY}
+              requests:
+                nvidia.com/gpu: ${GPU_PER_WORKER}
+                hugepages-2Mi: ${HUGEPAGES_2MI}
+                vpc.amazonaws.com/efa: ${EFA_PER_WORKER}
+                memory: ${MEMORY}
+          volumes:
+          - name: shmem
+            hostPath:
+              path: /dev/shm
+EOF
diff --git a/patterns/nvidia-gpu-efa/main.tf b/patterns/nvidia-gpu-efa/main.tf
index 03b3fced85..c67ee3eac9 100644
--- a/patterns/nvidia-gpu-efa/main.tf
+++ b/patterns/nvidia-gpu-efa/main.tf
@@ -46,7 +46,7 @@ data "aws_availability_zones" "available" {}
 locals {
   name   = basename(path.cwd)
-  region = "us-west-2"
+  region = "us-east-2"
 
   vpc_cidr = "10.0.0.0/16"
   azs      = 
slice(data.aws_availability_zones.available.names, 0, 3)