Skip to content

Commit

Permalink
Run NCCL tests on EKS using MPI operator (#1188)
Browse files Browse the repository at this point in the history
This tests the current latest CUDA DL image's native EFA support.
  • Loading branch information
olupton authored Dec 11, 2024
1 parent bae53fc commit 8dde981
Show file tree
Hide file tree
Showing 3 changed files with 234 additions and 0 deletions.
12 changes: 12 additions & 0 deletions .github/container/Dockerfile.mpi-operator-compatible-base
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Thin layer on top of BASE_IMAGE that adds what the Kubeflow MPI operator needs
# from its launcher/worker containers: an SSH server, plus SSH client/server
# settings that allow non-interactive logins between the pods of a job.
ARG BASE_IMAGE
FROM ${BASE_IMAGE} AS mealkit
FROM mealkit AS final
# Use apt-get throughout: the apt front-end does not offer a stable CLI for use
# in scripts. /run/sshd is created with -p so a base image that already ships
# the directory does not break the build.
RUN apt-get update \
    && apt-get install -y openssh-server \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* \
    && mkdir -p /run/sshd
# Disable strict host-key checking, discard known hosts and relax StrictModes,
# mirroring the MPI operator's own base image:
# https://github.com/kubeflow/mpi-operator/blob/c738a83b185b4bf3bf7e6eca9d4503653294c995/build/base/Dockerfile#L16
RUN sed -i "s/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g" /etc/ssh/ssh_config \
    && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
    && sed -i "s/#\(StrictModes \).*/\1no/g" /etc/ssh/sshd_config
75 changes: 75 additions & 0 deletions .github/eks-workflow-files/mpi-nccl-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Template MPIJob that runs one of the NCCL performance tests across two
# 8-GPU worker pods. Every PLACEHOLDER value is filled in by the
# "Configure Kubernetes job" step of the nccl-k8s workflow before submission.
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: PLACEHOLDER
spec:
  # Without this, the first few attempts to run the launcher will result in
  # errors due to failed DNS resolution of the worker names. It works
  # eventually, given a big enough backoffLimit, but it makes it harder to
  # handle log-streaming and to identify the "real" exit code of the job.
  launcherCreationPolicy: WaitForWorkersReady
  runPolicy:
    cleanPodPolicy: Running
    # Surface errors directly to GitHub Actions without internal retries
    backoffLimit: 0
  # 1 MPI rank per GPU
  slotsPerWorker: 8
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      # Without this the launcher pod will be deleted on failure, which makes
      # it hard to provide useful diagnostics
      restartPolicy: Never
      template:
        spec:
          containers:
            - image: PLACEHOLDER
              imagePullPolicy: IfNotPresent
              name: PLACEHOLDER
              # NOTE: the workflow overwrites command[6] (the test binary) by
              # index, so do not insert or remove arguments before it without
              # updating the yq expression in nccl-k8s.yaml.
              command:
                - mpirun
                - --allow-run-as-root
                # Total ranks: Worker.replicas (2) x slotsPerWorker (8)
                - -np
                - "16"
                # Ranks per node
                - -N
                - "8"
                # command[6]: name of the NCCL test executable
                - PLACEHOLDER
                # nccl-tests options: message sizes from 8 bytes (-b) to 16G
                # (-e), multiplying by 2 at each step (-f)
                - -b
                - "8"
                - -e
                - "16G"
                - -f
                - "2"
                # One GPU per rank (-g), result checking enabled (-c),
                # 100 iterations per size (-n)
                - -g
                - "1"
                - -c
                - "1"
                - -n
                - "100"
          imagePullSecrets:
            - name: PLACEHOLDER
    Worker:
      replicas: 2
      template:
        spec:
          nodeSelector:
            node.kubernetes.io/instance-type: "p5.48xlarge"
          containers:
            - image: PLACEHOLDER
              imagePullPolicy: IfNotPresent
              name: PLACEHOLDER
              volumeMounts:
                - name: shmem
                  mountPath: /dev/shm
              resources:
                limits:
                  nvidia.com/gpu: 8
                  hugepages-2Mi: 5120Mi
                  vpc.amazonaws.com/efa: 32
                  memory: 32000Mi
          imagePullSecrets:
            - name: PLACEHOLDER
          volumes:
            # Expose the host's /dev/shm inside the worker container
            - name: shmem
              hostPath:
                path: /dev/shm
147 changes: 147 additions & 0 deletions .github/workflows/nccl-k8s.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
# Builds an MPI-operator-compatible image on top of a CUDA base image, then runs
# the NCCL performance tests as Kubeflow MPIJobs on the EKS cluster, streaming
# the launcher's logs and exit status back to GitHub Actions.
name: NCCL on Kubernetes
on:
  schedule:
    - cron: '30 8 * * *'
  pull_request:
    types:
      - opened
      - reopened
      - ready_for_review
      - synchronize
    paths-ignore:
      - '**.md'
  workflow_dispatch:
    inputs:
      # Note that cuda-dl-base installs the NCCL tests, while the vanilla
      # nvidia/cuda images do not; when JAX-Toolbox moves to using cuda-dl-base
      # this workflow ought to be modified to test one of the JAX-Toolbox
      # containers.
      CUDA_IMAGE:
        type: string
        description: 'CUDA image to use as base, e.g. nvcr.io/nvidia/cuda-dl-base:24.11-cuda12.6-devel-ubuntu24.04'
        default: ''
        required: false

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

permissions:
  actions: write  # to cancel previous workflows
  contents: read  # to fetch code
  packages: write  # to upload container

jobs:
  build-mpi-operator-compatible-base:
    uses: ./.github/workflows/_build.yaml
    with:
      ARCHITECTURE: amd64
      ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build
      BADGE_FILENAME: badge-mpi-operator-compatible-base-build
      # Not important; this image is never published. Quoted so that it is
      # always passed as a string rather than parsed as a date.
      BUILD_DATE: '0000-00-00'
      BASE_IMAGE: ${{ inputs.CUDA_IMAGE || 'nvcr.io/nvidia/cuda-dl-base:24.11-cuda12.6-devel-ubuntu24.04' }}
      CONTAINER_NAME: mpi-operator-compatible-base
      DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base
      RUNNER_SIZE: small
    secrets: inherit

  # Runs each NCCL collective benchmark in the matrix as its own MPIJob.
  nccl-test:
    needs: build-mpi-operator-compatible-base
    strategy:
      matrix:
        test:
          - all_gather_perf_mpi
          - all_reduce_perf_mpi
          - broadcast_perf_mpi
          - reduce_scatter_perf_mpi
    runs-on: eks
    env:
      BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }}
      TEST_NAME: ${{ matrix.test }}
    steps:
      - name: Check out the repository
        uses: actions/checkout@v4
      - name: Install yq
        run: |
          mkdir -p local_bin/
          # --fail: do not save an HTTP error page as if it were the binary
          curl --fail -L -o ./local_bin/yq "https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture)"
          chmod 755 ./local_bin/yq
          echo "${PWD}/local_bin" >> "${GITHUB_PATH}"
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Store GitHub Container Registry token as Kubernetes secret
        run: |
          # Replace underscores in TEST_NAME with - to make a valid Kubernetes name
          JOB_NAME="nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${TEST_NAME//_/-}"
          LAUNCHER_NAME="${JOB_NAME}-launcher"
          TOKEN_NAME="${JOB_NAME}-token"
          # Make these available to later steps
          echo "JOB_NAME=${JOB_NAME}" >> "$GITHUB_ENV"
          echo "LAUNCHER_NAME=${LAUNCHER_NAME}" >> "$GITHUB_ENV"
          echo "TOKEN_NAME=${TOKEN_NAME}" >> "$GITHUB_ENV"
          # The docker login above wrote the registry credentials to config.json
          kubectl create secret generic \
            "${TOKEN_NAME}" \
            --from-file=.dockerconfigjson="${HOME}/.docker/config.json" \
            --type=kubernetes.io/dockerconfigjson
      - name: Configure Kubernetes job
        run: |
          # Fill in every PLACEHOLDER of the MPIJob template in place; command[6]
          # is the slot reserved for the test executable name.
          export WORKER_NAME="${JOB_NAME}-worker"
          yq -i '.metadata.name = strenv(JOB_NAME)
            | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE)
            | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME)
            | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
            | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[6] = strenv(TEST_NAME)
            | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE)
            | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME)
            | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \
            .github/eks-workflow-files/mpi-nccl-test.yml
          # Show the substitutions in the job log
          git diff .github/eks-workflow-files/mpi-nccl-test.yml
      - name: Submit Kubernetes job
        run: kubectl apply -f .github/eks-workflow-files/mpi-nccl-test.yml
      - name: Wait for Kubernetes job to start
        # Note that this is *not* using JOB_NAME
        run: |
          # Launcher job is only created once the workers are ready; wait for its
          # creation. This is where we block if the cluster is busy executing other
          # jobs, but it might be better to impose more of a parallelism limit at
          # the GitHub Actions level to keep the Kubernetes queue length modest
          kubectl wait --for=create "job/${LAUNCHER_NAME}" --timeout=3600s
          # Streaming logs will fail if the container/pod is still pending
          while [[ -n $(kubectl get pods --selector="batch.kubernetes.io/job-name=${LAUNCHER_NAME}" --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
            sleep 1
          done
      - name: Stream Kubernetes job output
        # Note that this is *not* JOB_NAME
        # TODO: --all-containers=true --all-pods=true could make sense here
        run: kubectl logs --follow "job/${LAUNCHER_NAME}"
      - name: Retrieve Kubernetes job status
        shell: bash -exo pipefail {0}
        run: |
          # Poll the launcher job until exactly one pod has failed or succeeded.
          # The jsonpath prints "<failed>:<succeeded>"; either side may be empty,
          # in which case it is treated as 0.
          while readarray -d : -t status < <(kubectl get "job/${LAUNCHER_NAME}" -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
            failure=${status[0]:-0}
            success=${status[1]:-0}
            total=$((failure+success))
            # Numeric comparisons: inside [[ ]], < and == compare strings
            if [[ ${total} -lt 1 ]]; then
              sleep 1
            elif [[ ${total} -eq 1 ]]; then
              break
            else
              # Shouldn't happen, maybe a sign the job being monitored does not
              # have a single launcher pod?
              exit 255
            fi
          done
          # 0 if the launcher succeeded, 1 if it failed
          exit ${failure}
      # Provide more debug output in case of failure; note that some kinds of
      # launch failure do not produce any log output.
      - name: Debug failed Kubernetes job
        if: failure()
        run: |
          # Provide better debug in case of launch failures that will not produce log output
          pods=$(kubectl get pods --selector="batch.kubernetes.io/job-name=${LAUNCHER_NAME}" -o name)
          if [[ -n "${pods}" ]]; then
            # Deliberately unquoted: one pod/<name> argument per matching pod
            kubectl describe ${pods}
          fi
      # Clean up in case of errors as well as success
      - name: Delete Kubernetes job
        if: always()
        # --ignore-not-found: an earlier step may have failed before the job was
        # submitted, in which case there is nothing to delete
        run: kubectl delete --ignore-not-found -f .github/eks-workflow-files/mpi-nccl-test.yml
      - name: Delete GitHub Container Registry token
        if: always()
        run: kubectl delete secret --ignore-not-found "${TOKEN_NAME}"

0 comments on commit 8dde981

Please sign in to comment.