From b4345bcf2ff7d1fcec082254286affe4d4423443 Mon Sep 17 00:00:00 2001 From: Justin Law Date: Wed, 9 Oct 2024 11:21:21 -0400 Subject: [PATCH 1/9] nvidia cuda base image and package --- .github/.gitkeep | 0 .github/workflows/build-test.yaml | 28 ++- .github/workflows/publish-image.yaml | 43 ---- .github/workflows/tag-and-release.yaml | 73 +++++-- README.md | 13 +- docker/Dockerfile | 5 - docker/Dockerfile.gpu | 35 +++ docs/GPU.md | 230 ++++++++++++++++++++ release-please-config.json | 7 +- tasks.yaml | 87 ++++++-- tests/cuda-device-query.yaml | 21 ++ tests/cuda-vector-add.yaml | 21 ++ values/nvidia-gpu-operator-values.yaml | 284 +++++++++++++++++++++++++ zarf.yaml | 104 ++++++++- 14 files changed, 841 insertions(+), 110 deletions(-) delete mode 100644 .github/.gitkeep delete mode 100644 .github/workflows/publish-image.yaml delete mode 100644 docker/Dockerfile create mode 100644 docker/Dockerfile.gpu create mode 100644 docs/GPU.md create mode 100644 tests/cuda-device-query.yaml create mode 100644 tests/cuda-vector-add.yaml create mode 100644 values/nvidia-gpu-operator-values.yaml diff --git a/.github/.gitkeep b/.github/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/.github/workflows/build-test.yaml b/.github/workflows/build-test.yaml index 4424c3c..67e7dd2 100644 --- a/.github/workflows/build-test.yaml +++ b/.github/workflows/build-test.yaml @@ -7,35 +7,31 @@ on: - "docs/**" - "CODEOWNERS" -permissions: - id-token: write - contents: read - jobs: test-clean-install: runs-on: ubuntu-latest + + permissions: + id-token: write + contents: read + strategy: matrix: - image: ["rancher/k3s"] - version: ["v1.29.8-k3s1", "v1.30.4-k3s1", "v1.31.0-k3s1"] + version: + ["v1.28.8-k3s1", "v1.29.8-k3s1", "v1.30.4-k3s1", "v1.31.0-k3s1"] steps: - - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4 + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4 - name: Setup UDS - if: always() uses: defenseunicorns/uds-common/.github/actions/setup@e3008473beab00b12a94f9fcc7340124338d5c08 # v0.13.1 with: - username: ${{secrets.IRON_BANK_ROBOT_USERNAME}} - password: ${{secrets.IRON_BANK_ROBOT_PASSWORD}} - - # Step is not currently being used, could be uncommented if custom image support is needed in the future - # - name: Build the custom k3s image - # if: ${{matrix.image}} != "rancher/k3s" - # run: uds run build-image --set VERSION=${{matrix.version}} --no-progress + registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} + registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} + ghToken: ${{ secrets.GITHUB_TOKEN }} - name: Create and deploy the uds-k3d package - run: uds run --set IMAGE_NAME=${{matrix.image}} --set VERSION=${{matrix.version}} --no-progress + run: uds run --set K3S_IMAGE_VERSION=${{matrix.version}} --no-progress - name: Validate uds-k3d package run: uds run validate --no-progress diff --git a/.github/workflows/publish-image.yaml b/.github/workflows/publish-image.yaml deleted file mode 100644 index 64ce092..0000000 --- a/.github/workflows/publish-image.yaml +++ /dev/null @@ -1,43 +0,0 @@ -name: Publish k3s image - -# Workflow is not currently being used, switched to a manual trigger only -on: workflow_dispatch - # push: - # branches: - # - main - # paths: - # - docker/** - # - .github/workflows/publish-image.yaml - -jobs: - publish-k3s-image: - runs-on: ubuntu-latest - strategy: - matrix: - version: ["v1.2.3-k3s1"] # Placeholder - - permissions: - contents: read - packages: write - - steps: - - uses: 
actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4 - - - uses: docker/setup-buildx-action@8026d2bc3645ea78b0d2544766a1225eb5691f89 # v3.7.0 - - - name: Setup UDS - if: always() - uses: defenseunicorns/uds-common/.github/actions/setup@e3008473beab00b12a94f9fcc7340124338d5c08 # v0.13.1 - with: - username: ${{secrets.IRON_BANK_ROBOT_USERNAME}} - password: ${{secrets.IRON_BANK_ROBOT_PASSWORD}} - - - name: Login to GHCR - uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3 - with: - registry: ghcr.io - username: dummy - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Publish the custom k3s image - run: uds run publish-image --set VERSION=${{matrix.version}} --no-progress diff --git a/.github/workflows/tag-and-release.yaml b/.github/workflows/tag-and-release.yaml index d6f2a26..ae0833c 100644 --- a/.github/workflows/tag-and-release.yaml +++ b/.github/workflows/tag-and-release.yaml @@ -7,8 +7,10 @@ on: jobs: tag-new-version: - permissions: write-all runs-on: ubuntu-latest + + permissions: write-all + outputs: release_created: ${{ steps.release-flag.outputs.release_created }} steps: @@ -25,28 +27,73 @@ jobs: if: ${{ needs.tag-new-version.outputs.release_created == 'true'}} runs-on: ubuntu-latest + strategy: + matrix: + k3s_image_repository: ["rancher/k3s"] + k3s_image_version: + ["v1.28.8-k3s1", "v1.29.8-k3s1", "v1.30.4-k3s1", "v1.31.0-k3s1"] + permissions: contents: read packages: write steps: - - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4 + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4 - name: Setup UDS - if: always() uses: defenseunicorns/uds-common/.github/actions/setup@e3008473beab00b12a94f9fcc7340124338d5c08 # v0.13.1 with: - username: ${{secrets.IRON_BANK_ROBOT_USERNAME}} - password: ${{secrets.IRON_BANK_ROBOT_PASSWORD}} + registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} + registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} + ghToken: ${{ secrets.GITHUB_TOKEN }} + + - name: Publish the base capability + run: | + uds zarf package create --confirm -a arm64 -o oci://ghcr.io/defenseunicorns/packages \ + --set K3S_IMAGE_REPOSITORY=${{ matrix.k3s_image_repository }} \ + --set K3S_IMAGE_VERSION=${{ matrix.k3s_image_version }} + + uds zarf package create --confirm -a amd64 -o oci://ghcr.io/defenseunicorns/packages \ + --set K3S_IMAGE_REPOSITORY=${{ matrix.k3s_image_repository }} \ + --set K3S_IMAGE_VERSION=${{ matrix.k3s_image_version }} + + publish-uds-cuda-package: + needs: tag-new-version + if: ${{ needs.tag-new-version.outputs.release_created == 'true'}} + runs-on: ubuntu-latest + + strategy: + matrix: + k3s_image_repository: ["rancher/k3s"] + k3s_image_version: + ["v1.28.8-k3s1", "v1.29.8-k3s1", "v1.30.4-k3s1", "v1.31.0-k3s1"] + cuda_image_version: + [ + 11.8.0-base-ubuntu22.04, + 12.1.0-base-ubuntu22.04, + 12.5.0-base-ubuntu22.04, + ] + + steps: + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4 - - name: Login to GHCR - uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3 + - uses: docker/setup-buildx-action@8026d2bc3645ea78b0d2544766a1225eb5691f89 # v3.7.0 + + - name: Setup UDS + uses: defenseunicorns/uds-common/.github/actions/setup@e3008473beab00b12a94f9fcc7340124338d5c08 # v0.13.1 with: - registry: ghcr.io - username: dummy - password: ${{ secrets.GITHUB_TOKEN }} + registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} + registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} + ghToken: ${{ secrets.GITHUB_TOKEN }} + + - name: Publish the 
CUDA K3s image + run: | + uds run publish-cuda-image \ + --set K3S_IMAGE_REPOSITORY=${{ matrix.k3s_image_repository }} \ + --set K3S_IMAGE_VERSION="${{ matrix.k3s_image_version }}" \ + --set CUDA_IMAGE_VERSION="${{ matrix.cuda_image_version }}" \ + --no-progress - - name: Publish the capability + - name: Publish the CUDA capability run: | - uds zarf package create --confirm -a arm64 -o oci://ghcr.io/defenseunicorns/packages - uds zarf package create --confirm -a amd64 -o oci://ghcr.io/defenseunicorns/packages + uds zarf package create --confirm -a amd64 -o oci://ghcr.io/defenseunicorns/packages --flavor cuda diff --git a/README.md b/README.md index ab4b3d7..a62108b 100644 --- a/README.md +++ b/README.md @@ -16,16 +16,17 @@ sudo ssh -N -L 80:localhost:80 -L 443:localhost:443 -L 6550:localhost:6550 [!NOTE] > UDS K3d intentionally does not address airgap concerns for K3d or the load balancer logic deployed in this package. This allows running `zarf init` or deploying a Zarf Init Package via a UDS Bundle after the UDS K3d environment is deployed. -## Prerequisites +## Pre-Requisites - [UDS CLI](https://github.com/defenseunicorns/uds-cli/blob/main/README.md#install) & [K3d](https://k3d.io/#installation) using the versions specified in the [uds-common repo](https://github.com/defenseunicorns/uds-common/blob/main/README.md#supported-tool-versions) - [Docker](https://docs.docker.com/get-docker/) or [Podman](https://podman.io/getting-started/installation) for running K3d +- See the [GPU Configuration](./docs/GPU.md) information for more details on enabling NVIDIA GPU support within the cluster ## Deploy -`uds zarf package deploy oci://defenseunicorns/uds-k3d:0.9.0` +`uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:0.11.2` @@ -53,13 +54,13 @@ k3d cluster start uds ## Additional Info -You can set extra k3d args by setting the deploy-time ZARF_VAR_K3D_EXTRA_ARGS. See below `zarf-config.yaml` example k3d args: +You can set extra K3d arguments by setting the deploy-time `ZARF_VAR_K3D_EXTRA_ARGS`. 
See the `zarf-config.yaml` example below for sample K3d arguments:

```yaml
package:
  deploy:
    set:
-      k3d_extra_args: "--k3s-arg --gpus=1 --k3s-arg --="
+      k3d_extra_args: --k3s-arg "--=@server:*" --gpus=all
```

### Configure MinIO
@@ -69,3 +70,7 @@ package:
### DNS Assumptions

- [DNS Assumptions](docs/DNS.md)
+
+### Enabling GPU Support
+
+- [GPU Workload Configuration](docs/GPU.md)
diff --git a/docker/Dockerfile b/docker/Dockerfile
deleted file mode 100644
index 8c4797f..0000000
--- a/docker/Dockerfile
+++ /dev/null
@@ -1,5 +0,0 @@
-ARG K3S_TAG="v1.2.3-k3s1" # Placeholder
-
-FROM rancher/k3s:$K3S_TAG as k3s
-
-# Do custom image things
diff --git a/docker/Dockerfile.gpu b/docker/Dockerfile.gpu
new file mode 100644
index 0000000..1e7339b
--- /dev/null
+++ b/docker/Dockerfile.gpu
@@ -0,0 +1,35 @@
+ARG K3S_REPOSITORY="rancher/k3s"
+ARG K3S_TAG="v1.30.4-k3s1"
+ARG CUDA_TAG="12.1.0-base-ubuntu22.04"
+
+FROM $K3S_REPOSITORY:$K3S_TAG AS k3s
+
+FROM nvidia/cuda:$CUDA_TAG
+
+# Install the NVIDIA container toolkit
+RUN apt-get update && \
+    apt-get install -y curl && \
+    curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg && \
+    curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
+    sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
+    tee /etc/apt/sources.list.d/nvidia-container-toolkit.list && \
+    apt-get update && \
+    apt-get install -y nvidia-container-toolkit-base nvidia-container-toolkit nvidia-container-runtime util-linux && \
+    nvidia-ctk runtime configure --runtime=containerd
+
+COPY --from=k3s / / --exclude=/bin/
+COPY --from=k3s /bin /bin
+
+VOLUME /var/lib/kubelet
+VOLUME /var/lib/rancher/k3s
+VOLUME /var/lib/cni
+VOLUME /var/log
+
+# Resolve fsnotify issues
+RUN sysctl -w fs.inotify.max_user_watches=100000 && \
+    sysctl -w fs.inotify.max_user_instances=100000
+
+ENV PATH="$PATH:/bin/aux"
+
+ENTRYPOINT ["/bin/k3s"]
+CMD ["agent"]
diff --git a/docs/GPU.md b/docs/GPU.md
new file mode 100644
index 0000000..8fabfa5
--- /dev/null
+++ b/docs/GPU.md
@@ -0,0 +1,230 @@
+# GPU
+
+UDS K3d comes with optional base images that provide GPU scheduling in the cluster to allow for GPU-accelerated workloads (e.g., LLMs). Currently, UDS K3d only supports NVIDIA CUDA-capable GPUs, with considerations for supporting AMD and other workloads in the future.
+
+## NVIDIA
+
+### Pre-Requisites
+
+### NVIDIA Drivers
+
+- Ensure that the proper [NVIDIA drivers](https://www.nvidia.com/download/index.aspx) are installed (>=525.60).
+- Follow the [driver download](https://www.nvidia.com/download/index.aspx) by identifying your hardware from the provided list.
+
+### NVIDIA Container Toolkit
+
+- [Read the pre-requisites for installation and follow the instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-apt) to download and install the NVIDIA container toolkit (>=1.14).
+- After the successful installation off the toolkit, follow the [toolkit instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#configuring-docker) to verify that your default Docker runtime is configured for NVIDIA: + + ```bash + nvidia-ctk runtime configure --runtime=docker --config=$HOME/.config/docker/daemon.json + ``` + +- Verify that `nvidia` is now a runtime available to the Docker daemon to use: + + ```bash + # the expected output should be similar to: `Runtimes: io.containerd.runc.v2 nvidia runc` + docker info | grep -i nvidia + ``` + +- [Try out a sample CUDA workload](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/sample-workload.html) to ensure your Docker containers have access to the GPUs after configuration. +- (OPTIONAL) You can configure Docker to use the `nvidia` runtime by default by adding the `--set-as-default` flag during the container toolkit post-installation configuration step by running the following command: + + ```bash + nvidia-ctk runtime configure --runtime=docker --config=$HOME/.config/docker/daemon.json --set-as-default + ``` + +- (OPTIONAL) Verify that the default runtime is changed by running the following command: + + ```bash + # the expected output should be similar to: `Default Runtime: nvidia` + docker info | grep "Default Runtime" + ``` + +### Usage + +#### Local Build and Deployment + +To use the NVIDIA CUDA K3s image when bootstrapping a UDS K3d cluster, execute the following: + +```bash +uds run default-cuda +``` + +#### Remote Package Deployment + +To use the NVIDIA CUDA K3s image when bootstrapping a UDS K3d cluster, execute the following: + + + +```bash +export PACKAGE_VERSION=0.11.2 +uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:${PACKAGE_VERSION}-cuda --confirm +``` + + + +##### Additional Base Images + +This repository publishes several variations of the underlying K3d image and CUDA image so that it covers more compatibility cases (e.g., GPU driver versions, K3d versions, etc.). Please see the [published images](https://github.com/defenseunicorns/uds-k3d/pkgs/container/uds-k3d%2Fcuda-k3s) for all possible variations. + +Below are some examples of setting these variables to choose a different variation at deploy-time: + +```bash +uds run default-cuda --set K3S_IMAGE_VERSION="v1.29.8-k3s1" --set CUDA_IMAGE_VERSION="12.1.0-base-ubuntu22.04" +# OR +uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:${PACKAGE_VERSION}-cuda --confirm --set K3S_IMAGE_VERSION="v1.31.0-k3s1" --set CUDA_IMAGE_VERSION="12.5.0-base-ubuntu22.04" +# OR +uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:${PACKAGE_VERSION}-cuda --confirm --set K3S_IMAGE_VERSION="v1.29.8-k3s1" --set CUDA_IMAGE_VERSION="11.8.0-base-ubuntu22.04" +``` + +#### Tests + +This repository includes two CUDA workload tests that can be executed: + +```bash +uds run validate-cuda # device info query +uds run validate-cuda --set CUDA_TEST="cuda-vector-add" # vector addition +``` + +### Troubleshooting + +#### NVML Errors or Missing CUDA Dependencies + +None of the following should ever error or return `unknown version`: + +1. Check if your NVIDIA GPU drivers are installed: + + ```bash + nvidia-smi + ``` + +2. Check the version of your NVIDIA Container Toolkit: + + ```bash + nvidia-ctk --version + ``` + +3. 
Look at your Docker runtime information and make sure the following returns several lines of information:
+
+   ```bash
+   docker info | grep "nvidia"
+   ```
+
+4. Try running a CUDA sample test in the cluster: [CUDA Vector Add](https://github.com/NVIDIA/k8s-device-plugin/blob/a6a7ce12d28618d343c251ca0941222d7b8a46d3/README.md?plain=1#L145).
+
+#### Memory Errors or Process Locks
+
+If you are not deploying a fresh cluster or fresh packages (e.g., a GPU workload is already deployed), or you have a GPU that has other workloads on it (e.g., display), then there may not be enough resources to offload the workloads to the NVIDIA GPU.
+
+1. To see what host-level processes are on your NVIDIA GPU(s), run the following:
+
+   ```bash
+   nvidia-smi
+   ```
+
+2. To check which pods are scheduled with GPU resources in particular, you can run the following `uds zarf tools yq` command:
+
+   ```bash
+   uds zarf tools kubectl get pods \
+     --all-namespaces \
+     --output=yaml \
+     | uds zarf tools yq eval -o=json '
+       ["Pod", "Namespace", "Container", "GPU"] as $header |
+       [$header] + [
+         .items[] |
+         .metadata as $metadata |
+         .spec.containers[] |
+         select(.resources.requests["nvidia.com/gpu"]) |
+         [
+           $metadata.name,
+           $metadata.namespace,
+           .name,
+           .resources.requests["nvidia.com/gpu"]
+         ]
+       ]' - \
+     | uds zarf tools yq -r '(.[0] | @tsv), (.[1:][] | @tsv)' \
+     | column -t -s $'\t'
+   ```
+
+When you reinstall or start a new GPU-dependent pod, the previous PID (process) on the GPU may not have been flushed yet.
+
+1. Scale the previous GPU-dependent pod deployment down to 0, as the current `RollingUpdate` strategy for vLLM relies on back-up/secondary GPUs to be available for a graceful turnover
+2. Use `nvidia-smi` to check if the process has been flushed upon Pod termination BEFORE you deploy a new GPU-dependent pod, and if not, use `kill -9 <PID>` to manually flush the process
+
+#### MacOS
+
+UDS K3d's NVIDIA GPU support does not work on MacOS.
+
+#### Windows (WSL2)
+
+The NVIDIA GPU Operator does not work on WSL2 as of version v24.3.0 (see [issue](https://github.com/NVIDIA/gpu-operator/issues/318)); however, the NVIDIA Device Plugin, by itself, does work as of version 0.15.0-rc1 (see [comment](https://github.com/NVIDIA/k8s-device-plugin/issues/332#issuecomment-1927997436)).
+
+To get around this issue, the recommended course of action is to install UDS K3d without the `cuda` flavor, and then deploy the NVIDIA Device Plugin separately. Below are the steps for doing so:
+
+1. Run `uds run default --set K3D_EXTRA_ARGS="--gpus=all"` or `uds zarf package deploy oci://defenseunicorns/uds-k3d:${PACKAGE_VERSION} --confirm --set K3D_EXTRA_ARGS="--gpus=all"`
+2. 
Create an `nvidia-device-plugin.yaml` manifest like the one below, and a deploy it with `uds zarf tools kubectl apply -f nvidia-device-plugin.yaml` + + ```yaml + apiVersion: node.k8s.io/v1 + kind: RuntimeClass + metadata: + name: nvidia + handler: nvidia + --- + apiVersion: apps/v1 + kind: DaemonSet + metadata: + name: nvidia-device-plugin-daemonset + namespace: kube-system + spec: + selector: + matchLabels: + name: nvidia-device-plugin-daemonset + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + name: nvidia-device-plugin-daemonset + spec: + runtimeClassName: nvidia # Explicitly request the runtime + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + # Mark this pod as a critical add-on; when enabled, the critical add-on + # scheduler reserves resources for critical add-on pods so that they can + # be rescheduled after a failure. + # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ + priorityClassName: "system-node-critical" + containers: + - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2 + name: nvidia-device-plugin-ctr + env: + - name: PASS_DEVICE_SPECS + value: "true" + - name: FAIL_ON_INIT_ERROR + value: "true" + - name: DEVICE_LIST_STRATEGY + value: envvar + - name: DEVICE_ID_STRATEGY + value: uuid + - name: NVIDIA_VISIBLE_DEVICES + value: all + - name: NVIDIA_DRIVER_CAPABILITIES + value: compute,utility + - name: MPS_ROOT + value: /run/nvidia/mps + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + volumes: + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins + ``` diff --git a/release-please-config.json b/release-please-config.json index 85c883d..97e8e28 100644 --- a/release-please-config.json +++ b/release-please-config.json @@ -10,7 +10,12 @@ { "type": "chore", "section": "Miscellaneous", "hidden": false } ], "versioning": "default", - "extra-files": ["README.md", "zarf.yaml", "chart/Chart.yaml"] + "extra-files": [ + "README.md", + "zarf.yaml", + "chart/Chart.yaml", + "docs/GPU.md" + ] } } } diff --git a/tasks.yaml b/tasks.yaml index 00fb80c..3269a5d 100644 --- a/tasks.yaml +++ b/tasks.yaml @@ -1,38 +1,63 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/defenseunicorns/uds-cli/v0.16.0/tasks.schema.json + variables: - - name: VERSION - default: "v1.30.4-k3s1" - - name: IMAGE_NAME + - name: K3S_IMAGE_REPOSITORY default: "rancher/k3s" + - name: K3S_IMAGE_VERSION + default: "v1.30.4-k3s1" + - name: CUDA_IMAGE_VERSION + default: "12.1.0-base-ubuntu22.04" - name: K3D_EXTRA_ARGS default: "" - name: NGINX_EXTRA_PORTS default: "[]" + - name: CUDA_TEST + default: "cuda-device-query" tasks: - name: default description: "Build and deploy uds-k3d" actions: - - description: "Build UDS K3d package" - cmd: "uds zarf package create --confirm --no-progress" + - description: "Build the uds-k3d package" + cmd: "rm -rf zarf-package-uds-k3d-*.tar.zst && uds zarf package create --confirm --no-progress" - - description: "Deploy UDS K3d package" + - description: "Deploy the uds-k3d package" cmd: | uds zarf package deploy zarf-package-uds-k3d-*.tar.zst \ - --set K3D_IMAGE=${IMAGE_NAME}:${VERSION} \ + --set K3S_IMAGE_REPOSITORY=${K3S_IMAGE_REPOSITORY} \ + --set K3S_IMAGE_VERSION=${K3S_IMAGE_VERSION} \ + --set K3D_EXTRA_ARGS="${K3D_EXTRA_ARGS}" \ + --set NGINX_EXTRA_PORTS="${NGINX_EXTRA_PORTS}" \ + --no-progress --confirm + + - name: 
default-cuda + description: "Build and deploy uds-k3d with CUDA support" + actions: + - description: "Build the uds-k3d CUDA package" + cmd: "rm -rf zarf-package-uds-k3d-*.tar.zst && uds zarf package create --flavor cuda --confirm --no-progress" + + - description: "Build the k3s-cuda image locally" + task: build-cuda-image + + - description: "Deploy the uds-k3d CUDA package" + cmd: | + uds zarf package deploy zarf-package-uds-k3d-*.tar.zst \ + --set K3S_IMAGE_VERSION=${K3S_IMAGE_VERSION} \ + --set CUDA_IMAGE_VERSION=${CUDA_IMAGE_VERSION} \ --set K3D_EXTRA_ARGS="${K3D_EXTRA_ARGS}" \ --set NGINX_EXTRA_PORTS="${NGINX_EXTRA_PORTS}" \ --no-progress --confirm - name: validate actions: - - description: Validate coredns is up + - description: "Validate CoreDNS is up" wait: cluster: kind: Pod name: "k8s-app=kube-dns" namespace: kube-system condition: Ready - - description: Validate coredns is resolving *.uds.dev internally + - description: "Validate CoreDNS is resolving *.uds.dev internally" cmd: | set -e FOO_IP=$(uds zarf tools kubectl run dig-test --image=arunvelsriram/utils -q --restart=Never --rm -i -- dig +short foo.uds.dev) @@ -42,19 +67,45 @@ tasks: echo "CoreDNS patch failed, foo.uds.dev is resolving to 127.0.0.1" exit 1 fi - - description: Validate zarf init + - description: "Validate zarf init" cmd: | set -e uds zarf tools download-init --no-progress # Test zarf init due to containerd issue - https://github.com/defenseunicorns/zarf/issues/592 uds zarf init --confirm --no-progress - # - name: build-image - # actions: - # - description: Build the custom k3s image - # cmd: docker build -t ${IMAGE_NAME}:${VERSION} --build-arg K3S_TAG=${VERSION} docker/ + - name: validate-cuda + description: "Run a CUDA test workload in-cluster" + actions: + - description: "Deploy the device query test pod to the cluster" + cmd: | + uds zarf tools kubectl apply -f tests/${CUDA_TEST}.yaml + - description: "Await test completion and then display the test results" + cmd: | + uds zarf tools wait-for Pod cuda-test-pod '{.status.phase}'=Succeeded -n default --no-progress --timeout 120s + uds zarf tools kubectl logs -l app=cuda-test-pod -n default + - description: "Remove the completed test pod" + cmd: | + uds zarf tools kubectl delete Pod cuda-test-pod + + - name: build-cuda-image + actions: + - description: "Build the CUDA K3s image" + cmd: | + docker build -t ghcr.io/defenseunicorns/uds-k3d/cuda-k3s:${K3S_IMAGE_VERSION}-cuda-${CUDA_IMAGE_VERSION} \ + --build-arg K3S_REPOSITORY=${K3S_IMAGE_REPOSITORY} \ + --build-arg K3S_TAG=${K3S_IMAGE_VERSION} \ + --build-arg CUDA_TAG=${CUDA_IMAGE_VERSION} \ + -f docker/Dockerfile.gpu . -q - # - name: publish-image - # actions: - # - description: Publish the custom k3s image - # cmd: docker buildx build --push --platform linux/arm64/v8,linux/amd64 --tag ${IMAGE_NAME}:${VERSION} docker + - name: publish-cuda-image + actions: + - description: "Publish the CUDA K3s image" + cmd: | + docker buildx build --push \ + --platform linux/amd64 \ + --build-arg K3S_REPOSITORY=${K3S_IMAGE_REPOSITORY} \ + --build-arg K3S_TAG=${K3S_IMAGE_VERSION} \ + --build-arg CUDA_TAG=${CUDA_IMAGE_VERSION} \ + -t ghcr.io/defenseunicorns/uds-k3d/cuda-k3s:${K3S_IMAGE_VERSION}-cuda-${CUDA_IMAGE_VERSION} \ + -f docker/Dockerfile.gpu . 
-q diff --git a/tests/cuda-device-query.yaml b/tests/cuda-device-query.yaml new file mode 100644 index 0000000..b1b95da --- /dev/null +++ b/tests/cuda-device-query.yaml @@ -0,0 +1,21 @@ +apiVersion: v1 +kind: Pod +metadata: + name: cuda-test-pod + labels: + app: cuda-test-pod +spec: + runtimeClassName: nvidia + restartPolicy: Never + containers: + - name: cuda-container + image: nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0-ubuntu22.04 + resources: + limits: + nvidia.com/gpu: "1" # requesting 1 GPU + cpu: "1" + memory: 0.5Gi + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule diff --git a/tests/cuda-vector-add.yaml b/tests/cuda-vector-add.yaml new file mode 100644 index 0000000..a62ba98 --- /dev/null +++ b/tests/cuda-vector-add.yaml @@ -0,0 +1,21 @@ +apiVersion: v1 +kind: Pod +metadata: + name: cuda-test-pod + labels: + app: cuda-test-pod +spec: + runtimeClassName: nvidia + restartPolicy: Never + containers: + - name: cuda-container + image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0-ubuntu22.04 + resources: + limits: + nvidia.com/gpu: "1" # requesting 1 GPU + cpu: "1" + memory: 0.5Gi + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule diff --git a/values/nvidia-gpu-operator-values.yaml b/values/nvidia-gpu-operator-values.yaml new file mode 100644 index 0000000..a35fe71 --- /dev/null +++ b/values/nvidia-gpu-operator-values.yaml @@ -0,0 +1,284 @@ +# See the NVIDIA GPU Operator repository for more details on available values +# https://github.com/NVIDIA/gpu-operator/blob/main/deployments/gpu-operator/values.yaml + +platform: + openshift: false + +nfd: + # usually enabled by default, but choose to use external NFD from IronBank + enabled: true + nodefeaturerules: false + +psa: + enabled: false + +cdi: + enabled: false + +sandboxWorkloads: + enabled: false + +hostPaths: + # rootFS represents the path to the root filesystem of the host. + # This is used by components that need to interact with the host filesystem + # and as such this must be a chroot-able filesystem. + # Examples include the MIG Manager and Toolkit Container which may need to + # stop, start, or restart systemd services + rootFS: "/" + + # driverInstallDir represents the root at which driver files including libraries, + # config files, and executables can be found. + driverInstallDir: "/run/nvidia/driver" + +daemonsets: + labels: {} + annotations: {} + priorityClassName: system-node-critical + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + # configuration for controlling update strategy("OnDelete" or "RollingUpdate") of GPU Operands + # note that driver Daemonset is always set with OnDelete to avoid unintended disruptions + updateStrategy: "RollingUpdate" + # configuration for controlling rolling update of GPU Operands + rollingUpdate: + # maximum number of nodes to simultaneously apply pod updates on. + # can be specified either as number or percentage of nodes. Default 1. 
+ maxUnavailable: "1" + +validator: + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: [] + args: [] + resources: {} + plugin: + env: + - name: WITH_WORKLOAD + value: "false" + driver: + env: + - name: DISABLE_DEV_CHAR_SYMLINK_CREATION + value: "true" + - name: NVIDIA_VISIBLE_DEVICES + value: all + # Default value of "all" causes the "display" capability to also be considered; + # however, not all hosts have or allow that capability, causing the daemonset to fail + - name: NVIDIA_DRIVER_CAPABILITIES + value: compute,utility + +operator: + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + priorityClassName: system-node-critical + # Can be set to `containerd`, `docker`, etc. + defaultRuntime: docker + runtimeClass: nvidia + use_ocp_driver_toolkit: false + # cleanup CRD on chart un-install + cleanupCRD: false + # upgrade CRD on chart upgrade, requires --disable-openapi-validation flag + # to be passed during helm upgrade. + upgradeCRD: false + initContainer: + image: cuda + + tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Equal" + value: "" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/control-plane" + operator: "Equal" + value: "" + effect: "NoSchedule" + annotations: + openshift.io/scc: restricted-readonly + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + preference: + matchExpressions: + - key: "node-role.kubernetes.io/master" + operator: In + values: [""] + - weight: 1 + preference: + matchExpressions: + - key: "node-role.kubernetes.io/control-plane" + operator: In + values: [""] + logging: + # Zap time encoding (one of 'epoch', 'millis', 'nano', 'iso8601', 'rfc3339' or 'rfc3339nano') + timeEncoding: epoch + # Zap Level to configure the verbosity of logging. 
Can be one of 'debug', 'info', 'error', or any integer value > 0 which corresponds to custom debug levels of increasing verbosity + level: info + # Development Mode defaults(encoder=consoleEncoder,logLevel=Debug,stackTraceLevel=Warn) + # Production Mode defaults(encoder=jsonEncoder,logLevel=Info,stackTraceLevel=Error) + develMode: true + resources: + limits: + cpu: 500m + memory: 350Mi + requests: + cpu: 200m + memory: 100Mi + +mig: + strategy: single + +driver: + # usually enabled by default, depends on deployment environment + enabled: false + +toolkit: + # usually enabled by default, depends on deployment environment + enabled: false + +devicePlugin: + enabled: true + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + args: [] + env: + - name: PASS_DEVICE_SPECS + value: "true" + - name: FAIL_ON_INIT_ERROR + value: "true" + - name: DEVICE_LIST_STRATEGY + value: envvar + - name: DEVICE_ID_STRATEGY + value: uuid + - name: NVIDIA_VISIBLE_DEVICES + value: all + # Default value of "all" causes the "display" capability to also be considered; + # however, not all hosts have or allow that capability, causing the daemonset to fail + - name: NVIDIA_DRIVER_CAPABILITIES + value: compute,utility + resources: {} + config: + # Create a ConfigMap (default: false) + create: false + # ConfigMap name (either exiting or to create a new one with create=true above) + name: "" + # Default config name within the ConfigMap + default: "" + # Data section for the ConfigMap to create (i.e only applies when create=true) + data: {} + # MPS related configuration for the plugin + mps: + # MPS root path on the host + root: "/run/nvidia/mps" + +# standalone dcgm host engine +dcgm: + # disabled by default to use embedded nv-host engine by exporter + enabled: false + +dcgmExporter: + enabled: true + imagePullPolicy: IfNotPresent + env: + - name: DCGM_EXPORTER_LISTEN + value: ":9400" + - name: DCGM_EXPORTER_KUBERNETES + value: "true" + - name: DCGM_EXPORTER_COLLECTORS + value: "/etc/dcgm-exporter/dcp-metrics-included.csv" + resources: {} + serviceMonitor: + enabled: false + interval: 15s + honorLabels: false + additionalLabels: {} + relabelings: [] + +gfd: + enabled: true + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: + - name: GFD_SLEEP_INTERVAL + value: 60s + - name: GFD_FAIL_ON_INIT_ERROR + value: "true" + resources: {} + +migManager: + # usually enabled by default, depends on deployment environment + enabled: false + +nodeStatusExporter: + enabled: false + +gds: + enabled: false + +gdrcopy: + enabled: false + +vgpuManager: + enabled: false + +vgpuDeviceManager: + # usually enabled by default, depends on deployment environment + enabled: false + +vfioManager: + # usually enabled by default, depends on deployment environment + enabled: false + +kataManager: + enabled: false + +sandboxDevicePlugin: + # usually enabled by default, depends on deployment environment + enabled: false + +ccManager: + enabled: false + +node-feature-discovery: + enableNodeFeatureApi: true + gc: + enable: true + replicaCount: 1 + serviceAccount: + name: node-feature-discovery + create: false + worker: + serviceAccount: + name: node-feature-discovery + # disable creation to avoid duplicate serviceaccount creation by master spec below + create: false + tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Equal" + value: "" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/control-plane" + operator: "Equal" + value: "" + effect: "NoSchedule" + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + 
config: + sources: + pci: + deviceClassWhitelist: + - "02" + - "0200" + - "0207" + - "0300" + - "0302" + deviceLabelFields: + - vendor + master: + serviceAccount: + name: node-feature-discovery + create: true + config: + extraLabelNs: ["nvidia.com"] diff --git a/zarf.yaml b/zarf.yaml index 9a9d19f..2abfdca 100644 --- a/zarf.yaml +++ b/zarf.yaml @@ -4,7 +4,7 @@ kind: ZarfPackageConfig metadata: name: uds-k3d description: "UDS K3d Cluster Setup. WARNING: This will destroy the cluster if it already exists." - url: https://github.com/defenseunicorns/uds-k3d + url: https://github.com/justinthelaw/uds-k3d yolo: true # x-release-please-start-version version: "0.9.0" @@ -15,9 +15,21 @@ variables: description: "Name of the cluster" default: "uds" - - name: K3D_IMAGE - description: "K3d image to use" - default: "rancher/k3s:v1.30.4-k3s1" + - name: K3S_IMAGE_REPOSITORY + description: "K3s image repository to use" + default: "rancher/k3s" + + - name: K3S_IMAGE_VERSION + description: "K3d image version to use" + default: "v1.30.4-k3s1" + + - name: CUDA_IMAGE_VERSION + description: "CUDA image to use" + default: "12.1.0-base-ubuntu22.04" + + - name: NUMBER_OF_GPUS + description: "Number of GPUs to passthrough to the K3D cluster" + default: "all" - name: K3D_EXTRA_ARGS description: "Optionally pass k3d arguments to the default" @@ -37,6 +49,43 @@ components: - cmd: k3d cluster delete ${ZARF_VAR_CLUSTER_NAME} description: "Destroy the cluster" + - name: set-k3d-image + required: true + description: "Set the K3s base image" + actions: + onDeploy: + before: + - cmd: | + echo "${ZARF_VAR_K3S_IMAGE_REPOSITORY}:${ZARF_VAR_K3S_IMAGE_VERSION}" + setVariables: + - name: K3D_IMAGE + + - name: inject-cuda-image + required: true + only: + flavor: cuda + description: "Overwrites the K3s base image variable to be the CUDA K3s image" + actions: + onDeploy: + before: + - cmd: | + echo "ghcr.io/justinthelaw/uds-k3d/cuda-k3s:${ZARF_VAR_K3S_IMAGE_VERSION}-cuda-${ZARF_VAR_CUDA_IMAGE_VERSION}" + setVariables: + - name: K3D_IMAGE + + - name: expose-gpus + required: true + only: + flavor: cuda + description: "Adds the extra K3d argument for exposing host GPUs to the cluster" + actions: + onDeploy: + before: + - cmd: | + echo "${ZARF_VAR_K3D_EXTRA_ARGS} --gpus=${ZARF_VAR_NUMBER_OF_GPUS}" + setVariables: + - name: K3D_EXTRA_ARGS + - name: create-cluster required: true description: "Create the k3d cluster" @@ -52,8 +101,9 @@ components: --k3s-arg "--disable=metrics-server@server:*" \ --k3s-arg "--disable=servicelb@server:*" \ --k3s-arg "--disable=local-storage@server:*" \ - --image ${ZARF_VAR_K3D_IMAGE} ${ZARF_VAR_K3D_EXTRA_ARGS} \ - ${ZARF_VAR_CLUSTER_NAME} + ${ZARF_VAR_K3D_EXTRA_ARGS} \ + --image ${ZARF_VAR_K3D_IMAGE} \ + "${ZARF_VAR_CLUSTER_NAME}" description: "Create the cluster" onSuccess: - cmd: | @@ -73,12 +123,12 @@ components: actions: onDeploy: before: - - cmd: ./zarf tools kubectl get nodes -o=jsonpath='{.items[0].status.addresses[?(@.type=="InternalIP")].address}' | cut -d'.' -f1-3 - description: "Load network ip base for MetalLB" + - cmd: uds zarf tools kubectl get nodes -o=jsonpath='{.items[0].status.addresses[?(@.type=="InternalIP")].address}' | cut -d'.' 
-f1-3 + description: "Load network base IP for MetalLB" setVariables: - name: BASE_IP after: - - cmd: ./zarf tools kubectl rollout restart deployment coredns -n kube-system + - cmd: uds zarf tools kubectl rollout restart deployment coredns -n kube-system description: "Restart CoreDNS to pick up internal DNS override for uds.dev" charts: - name: metallb @@ -89,7 +139,7 @@ components: namespace: uds-dev-stack localPath: chart # x-release-please-start-version - version: 0.9.0 + version: 0.11.2 # x-release-please-end valuesFiles: - "values/dev-stack-values.yaml" @@ -104,3 +154,37 @@ components: url: https://charts.min.io/ valuesFiles: - "values/minio-values.yaml" + + - name: nvidia-gpu-operator + description: "Install the NVIDIA GPU Operator for CUDA-enabled clusters" + only: + flavor: cuda + required: true + charts: + - name: gpu-operator + url: https://helm.ngc.nvidia.com/nvidia + version: v24.3.0 + namespace: kube-system + valuesFiles: + - "values/nvidia-gpu-operator-values.yaml" + actions: + onDeploy: + after: + - description: "Validate nvidia-device-plugin-daemonset is up" + wait: + cluster: + kind: Pod + name: app=nvidia-device-plugin-daemonset + namespace: kube-system + # Ensure the device plugin is healthy, which might take a while depending on the machine + condition: "'{.status.conditions[2].status}'=True" + maxTotalSeconds: 600 + - description: "Validate nvidia-operator-validator is completed" + wait: + cluster: + kind: Pod + name: app=nvidia-operator-validator + namespace: kube-system + # Ensure the NVIDIA host validator job succeeds + condition: "'{.status.conditions[2].status}'=True" + maxTotalSeconds: 300 From ca327997220cbfe2aa7f7f1a8eb390145e89d145 Mon Sep 17 00:00:00 2001 From: Justin Law Date: Wed, 9 Oct 2024 11:22:32 -0400 Subject: [PATCH 2/9] remove incorrect org repo --- tasks.yaml | 4 ++-- zarf.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tasks.yaml b/tasks.yaml index 3269a5d..35e0987 100644 --- a/tasks.yaml +++ b/tasks.yaml @@ -90,7 +90,7 @@ tasks: - name: build-cuda-image actions: - - description: "Build the CUDA K3s image" + - description: "Build the k3s-cuda image" cmd: | docker build -t ghcr.io/defenseunicorns/uds-k3d/cuda-k3s:${K3S_IMAGE_VERSION}-cuda-${CUDA_IMAGE_VERSION} \ --build-arg K3S_REPOSITORY=${K3S_IMAGE_REPOSITORY} \ @@ -100,7 +100,7 @@ tasks: - name: publish-cuda-image actions: - - description: "Publish the CUDA K3s image" + - description: "Publish the k3s-cuda image" cmd: | docker buildx build --push \ --platform linux/amd64 \ diff --git a/zarf.yaml b/zarf.yaml index 2abfdca..0761026 100644 --- a/zarf.yaml +++ b/zarf.yaml @@ -4,7 +4,7 @@ kind: ZarfPackageConfig metadata: name: uds-k3d description: "UDS K3d Cluster Setup. WARNING: This will destroy the cluster if it already exists." 
- url: https://github.com/justinthelaw/uds-k3d + url: https://github.com/defenseunicorns/uds-k3d yolo: true # x-release-please-start-version version: "0.9.0" @@ -69,7 +69,7 @@ components: onDeploy: before: - cmd: | - echo "ghcr.io/justinthelaw/uds-k3d/cuda-k3s:${ZARF_VAR_K3S_IMAGE_VERSION}-cuda-${ZARF_VAR_CUDA_IMAGE_VERSION}" + echo "ghcr.io/defenseunicorns/uds-k3d/cuda-k3s:${ZARF_VAR_K3S_IMAGE_VERSION}-cuda-${ZARF_VAR_CUDA_IMAGE_VERSION}" setVariables: - name: K3D_IMAGE From d8c6637d1083d61b3741ac9b81662f0cd12ac54b Mon Sep 17 00:00:00 2001 From: Justin Law Date: Wed, 9 Oct 2024 11:24:53 -0400 Subject: [PATCH 3/9] generalize test description name --- tasks.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks.yaml b/tasks.yaml index 35e0987..732c10f 100644 --- a/tasks.yaml +++ b/tasks.yaml @@ -77,7 +77,7 @@ tasks: - name: validate-cuda description: "Run a CUDA test workload in-cluster" actions: - - description: "Deploy the device query test pod to the cluster" + - description: "Deploy the test pod to the cluster" cmd: | uds zarf tools kubectl apply -f tests/${CUDA_TEST}.yaml - description: "Await test completion and then display the test results" From cec1c4e5fb947d9d27fad9598238acf5b49cff6f Mon Sep 17 00:00:00 2001 From: Justin Law Date: Wed, 9 Oct 2024 11:43:57 -0400 Subject: [PATCH 4/9] align build-test matrix with publishing one --- .github/workflows/build-test.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-test.yaml b/.github/workflows/build-test.yaml index 67e7dd2..93ae97c 100644 --- a/.github/workflows/build-test.yaml +++ b/.github/workflows/build-test.yaml @@ -17,7 +17,8 @@ jobs: strategy: matrix: - version: + k3s_image_repository: ["rancher/k3s"] + k3s_image_version: ["v1.28.8-k3s1", "v1.29.8-k3s1", "v1.30.4-k3s1", "v1.31.0-k3s1"] steps: @@ -31,7 +32,7 @@ jobs: ghToken: ${{ secrets.GITHUB_TOKEN }} - name: Create and deploy the uds-k3d package - run: uds run --set K3S_IMAGE_VERSION=${{matrix.version}} --no-progress + run: uds run --set K3S_IMAGE_VERSION=${{matrix.k3s_image_version}} K3S_IMAGE_REPOSITORY=${{matrix.k3s_image_repository}} --no-progress - name: Validate uds-k3d package run: uds run validate --no-progress From ae9d9b19cda791c521fe4ce3cd8ee4a2b8d65cee Mon Sep 17 00:00:00 2001 From: Justin Law Date: Wed, 9 Oct 2024 11:45:52 -0400 Subject: [PATCH 5/9] fix --set typo --- .github/workflows/build-test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-test.yaml b/.github/workflows/build-test.yaml index 93ae97c..b3e29a1 100644 --- a/.github/workflows/build-test.yaml +++ b/.github/workflows/build-test.yaml @@ -32,7 +32,7 @@ jobs: ghToken: ${{ secrets.GITHUB_TOKEN }} - name: Create and deploy the uds-k3d package - run: uds run --set K3S_IMAGE_VERSION=${{matrix.k3s_image_version}} K3S_IMAGE_REPOSITORY=${{matrix.k3s_image_repository}} --no-progress + run: uds run --set K3S_IMAGE_VERSION=${{matrix.k3s_image_version}} --set K3S_IMAGE_REPOSITORY=${{matrix.k3s_image_repository}} --no-progress - name: Validate uds-k3d package run: uds run validate --no-progress From 6a8081f568fc379fe975ef8d64b23baad5df0972 Mon Sep 17 00:00:00 2001 From: Justin Law Date: Wed, 9 Oct 2024 11:48:00 -0400 Subject: [PATCH 6/9] remove erroneous version, go to 0.9.0 --- README.md | 2 +- docs/GPU.md | 2 +- zarf.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a62108b..ec7f236 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ sudo ssh 
-N -L 80:localhost:80 -L 443:localhost:443 -L 6550:localhost:6550 -`uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:0.11.2` +`uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:0.9.0` diff --git a/docs/GPU.md b/docs/GPU.md index 8fabfa5..5a81351 100644 --- a/docs/GPU.md +++ b/docs/GPU.md @@ -58,7 +58,7 @@ To use the NVIDIA CUDA K3s image when bootstrapping a UDS K3d cluster, execute t ```bash -export PACKAGE_VERSION=0.11.2 +export PACKAGE_VERSION=0.9.0 uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:${PACKAGE_VERSION}-cuda --confirm ``` diff --git a/zarf.yaml b/zarf.yaml index 0761026..5a61989 100644 --- a/zarf.yaml +++ b/zarf.yaml @@ -139,7 +139,7 @@ components: namespace: uds-dev-stack localPath: chart # x-release-please-start-version - version: 0.11.2 + version: 0.9.0 # x-release-please-end valuesFiles: - "values/dev-stack-values.yaml" From 0dc3f1450a055ab2b93cd62345e94c1064693116 Mon Sep 17 00:00:00 2001 From: Justin Law <81255462+justinthelaw@users.noreply.github.com> Date: Wed, 9 Oct 2024 12:02:29 -0400 Subject: [PATCH 7/9] Update GPU.md --- docs/GPU.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/GPU.md b/docs/GPU.md index 5a81351..4b8b203 100644 --- a/docs/GPU.md +++ b/docs/GPU.md @@ -160,7 +160,7 @@ UDS K3d's NVIDIA GPU support does not work on MacOS. The NVIDIA GPU Operator does not work on WSL2 as of version v24.3.0 (see [issue](https://github.com/NVIDIA/gpu-operator/issues/318)); however, the NVIDIA Device Plugin, by itself, does work as of version 0.15.0-rc1 (see [comment](https://github.com/NVIDIA/k8s-device-plugin/issues/332#issuecomment-1927997436)). -To get around this issue, the recommended course of action is to install UDS K3d without the `cuda` flavor, and then deploy the NVIDIA Device Plugin separately. Below are the steps for doing so: +To get around this issue, the recommended course of action is to install UDS K3d with the `cuda` flavor, delete the NVIDIA GPU Operator deployment, and then deploy the NVIDIA Device Plugin separately. Below are the steps for doing so: 1. Run `uds run default --set K3D_EXTRA_ARGS="--gpus=all"` or `uds zarf package deploy oci://defenseunicorns/uds-k3d:${PACKAGE_VERSION} --confirm --set K3D_EXTRA_ARGS="--gpus=all"` 2. Create an `nvidia-device-plugin.yaml` manifest like the one below, and a deploy it with `uds zarf tools kubectl apply -f nvidia-device-plugin.yaml` From b1174255dc0572046c33dc5e8ad57212ddbe8e34 Mon Sep 17 00:00:00 2001 From: Justin Law Date: Fri, 1 Nov 2024 14:54:29 -0400 Subject: [PATCH 8/9] add better docs for core and core-slim-dev --- docs/GPU.md | 138 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 96 insertions(+), 42 deletions(-) diff --git a/docs/GPU.md b/docs/GPU.md index 4b8b203..8caf257 100644 --- a/docs/GPU.md +++ b/docs/GPU.md @@ -2,48 +2,15 @@ UDS K3d comes with optional base images that provide GPU scheduling in the cluster to allow for GPU-accelerated workloads (e.g., LLMs). Currently, UDS K3d only supports NVIDIA CUDA-capable GPUs, with considerations for supporting AMD and other workloads in the future. -## NVIDIA +## Usage -### Pre-Requisites +The following usage steps use the `cuda` flavor of the UDS K3d package as an example of how to enable GPU access in a K3d cluster. 
-### NVIDIA Drivers +For troubleshooting during UDS K3d deployments, please see each flavor's specific instructions: -- Ensure that the proper [NVIDIA drivers](https://www.nvidia.com/download/index.aspx) are installed (>=525.60). -- Follow the [driver download](https://www.nvidia.com/download/index.aspx) by identifying your hardware from the provided list. - -### NVIDIA Container Toolkit - -- [Read the pre-requisites for installation and follow the instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-apt) to download and install the NVIDIA container toolkit (>=1.14). -- After the successful installation off the toolkit, follow the [toolkit instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#configuring-docker) to verify that your default Docker runtime is configured for NVIDIA: - - ```bash - nvidia-ctk runtime configure --runtime=docker --config=$HOME/.config/docker/daemon.json - ``` - -- Verify that `nvidia` is now a runtime available to the Docker daemon to use: - - ```bash - # the expected output should be similar to: `Runtimes: io.containerd.runc.v2 nvidia runc` - docker info | grep -i nvidia - ``` - -- [Try out a sample CUDA workload](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/sample-workload.html) to ensure your Docker containers have access to the GPUs after configuration. -- (OPTIONAL) You can configure Docker to use the `nvidia` runtime by default by adding the `--set-as-default` flag during the container toolkit post-installation configuration step by running the following command: - - ```bash - nvidia-ctk runtime configure --runtime=docker --config=$HOME/.config/docker/daemon.json --set-as-default - ``` +1. [`cuda` flavor troubleshooting](#nvidia) -- (OPTIONAL) Verify that the default runtime is changed by running the following command: - - ```bash - # the expected output should be similar to: `Default Runtime: nvidia` - docker info | grep "Default Runtime" - ``` - -### Usage - -#### Local Build and Deployment +### Local Build and Deployment To use the NVIDIA CUDA K3s image when bootstrapping a UDS K3d cluster, execute the following: @@ -51,7 +18,7 @@ To use the NVIDIA CUDA K3s image when bootstrapping a UDS K3d cluster, execute t uds run default-cuda ``` -#### Remote Package Deployment +### Remote Package Deployment To use the NVIDIA CUDA K3s image when bootstrapping a UDS K3d cluster, execute the following: @@ -64,7 +31,7 @@ uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:${PACKAGE -##### Additional Base Images +#### Additional Base Images This repository publishes several variations of the underlying K3d image and CUDA image so that it covers more compatibility cases (e.g., GPU driver versions, K3d versions, etc.). Please see the [published images](https://github.com/defenseunicorns/uds-k3d/pkgs/container/uds-k3d%2Fcuda-k3s) for all possible variations. 
@@ -78,7 +45,7 @@ uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:${PACKAGE uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:${PACKAGE_VERSION}-cuda --confirm --set K3S_IMAGE_VERSION="v1.29.8-k3s1" --set CUDA_IMAGE_VERSION="11.8.0-base-ubuntu22.04" ``` -#### Tests +### Tests This repository includes two CUDA workload tests that can be executed: @@ -87,7 +54,94 @@ uds run validate-cuda # device info query uds run validate-cuda --set CUDA_TEST="cuda-vector-add" # vector addition ``` -### Troubleshooting +### UDS Core + +Deploying UDS Core with a UDS K3d cluster capable of GPU support without building your own UDS bundle, that includes UDS K3d's GPU flavors and UDS Core, requires some extra argument in the UDS CLI. Below are examples of deploying the [full UDS Core](#core) and the [developer version of UDS Core](#core-slim-dev). + +### Core + +To deploy the full set of UDS Core services on top of UDS K3d `cuda`, you can run the following commands: + + + +```bash +export PACKAGE_VERSION=0.9.0 +uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:${PACKAGE_VERSION}-cuda --confirm +# fill-in with your desired UDS Core version and flavor +uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds/core:${UDS_CORE_VERSION}-${UDS_CORE_FLAVOR} --confirm +``` + + + +### Core Slim Dev + +Since the slim development version of UDS Core is only published as a bundle, the `cuda` version of the UDS K3d Zarf package cannot be used directly; therefore, the K3d arguments and NVIDIA GPU operator deployment that are normally handled automatically within this [Zarf package](../zarf.yaml) must be done manually. + +To allow GPU access in a UDS Core slim development cluster, the base k3s-cuda image published by this repository must be passed into the bundle deployment command and a separate deployment of one of the following options: + +1. [NVIDIA Device Plugin](https://github.com/NVIDIA/k8s-device-plugin) +2. [NVIDIA GPU Operator](https://github.com/NVIDIA/gpu-operator) + +To deploy the slim development set of UDS Core services on top of UDS K3d `cuda`, you can run the following commands: + +```bash +# fill-in with your desired UDS Core version +# fill-in with your desired k3s-CUDA image published by this repository +uds deploy k3d-core-slim-dev:${UDS_CORE_SLIM_DEV_VERSION} --set K3D_EXTRA_ARGS="--gpus=all --image=${K3S_CUDA_IMAGE}" --confirm + +# OPTION #1: use the NVIDIA Device Plugin from upstream - fill-in the desired version +uds zarf tools kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/${NVIDIA_DEVICE_PLUGIN_VERSION}/deployments/static/nvidia-device-plugin.yml + +# OPTION #2: use the NVIDIA GPU Operator's helm repository with this UDS K3d's NVIDIA GPU Operator values file +# this options requires helm to be locally installed and for the aforementioned values file to be available +helm repo add nvidia https://helm.ngc.nvidia.com/nvidia +helm repo update +helm install --wait --generate-name \ + -n kube-system \ + --values values/nvidia-gpu-operator-values.yaml \ + nvidia/gpu-operator +``` + +## Troubleshooting + +### NVIDIA + +#### NVIDIA Pre-Requisites + +##### NVIDIA Drivers + +- Ensure that the proper [NVIDIA drivers](https://www.nvidia.com/download/index.aspx) are installed (>=525.60). +- Follow the [driver download](https://www.nvidia.com/download/index.aspx) by identifying your hardware from the provided list. 
+ +##### NVIDIA Container Toolkit + +- [Read the pre-requisites for installation and follow the instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-apt) to download and install the NVIDIA container toolkit (>=1.14). +- After the successful installation off the toolkit, follow the [toolkit instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#configuring-docker) to verify that your default Docker runtime is configured for NVIDIA: + + ```bash + nvidia-ctk runtime configure --runtime=docker --config=$HOME/.config/docker/daemon.json + ``` + +- Verify that `nvidia` is now a runtime available to the Docker daemon to use: + + ```bash + # the expected output should be similar to: `Runtimes: io.containerd.runc.v2 nvidia runc` + docker info | grep -i nvidia + ``` + +- [Try out a sample CUDA workload](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/sample-workload.html) to ensure your Docker containers have access to the GPUs after configuration. +- (OPTIONAL) You can configure Docker to use the `nvidia` runtime by default by adding the `--set-as-default` flag during the container toolkit post-installation configuration step by running the following command: + + ```bash + nvidia-ctk runtime configure --runtime=docker --config=$HOME/.config/docker/daemon.json --set-as-default + ``` + +- (OPTIONAL) Verify that the default runtime is changed by running the following command: + + ```bash + # the expected output should be similar to: `Default Runtime: nvidia` + docker info | grep "Default Runtime" + ``` #### NVML Errors or Missing CUDA Dependencies From 25eb6c6ae76be1bde7959f1f18cf5ef12f74e139 Mon Sep 17 00:00:00 2001 From: Justin Law <81255462+justinthelaw@users.noreply.github.com> Date: Fri, 1 Nov 2024 16:21:58 -0400 Subject: [PATCH 9/9] use lfai nvidia device plugin values --- docs/GPU.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/GPU.md b/docs/GPU.md index 8caf257..90252f3 100644 --- a/docs/GPU.md +++ b/docs/GPU.md @@ -90,7 +90,7 @@ To deploy the slim development set of UDS Core services on top of UDS K3d `cuda` uds deploy k3d-core-slim-dev:${UDS_CORE_SLIM_DEV_VERSION} --set K3D_EXTRA_ARGS="--gpus=all --image=${K3S_CUDA_IMAGE}" --confirm # OPTION #1: use the NVIDIA Device Plugin from upstream - fill-in the desired version -uds zarf tools kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/${NVIDIA_DEVICE_PLUGIN_VERSION}/deployments/static/nvidia-device-plugin.yml +uds zarf tools kubectl create -f https://raw.githubusercontent.com/defenseunicorns/leapfrogai/refs/heads/main/packages/k3d-gpu/plugin/device-plugin-daemonset.yaml # OPTION #2: use the NVIDIA GPU Operator's helm repository with this UDS K3d's NVIDIA GPU Operator values file # this options requires helm to be locally installed and for the aforementioned values file to be available