From b4345bcf2ff7d1fcec082254286affe4d4423443 Mon Sep 17 00:00:00 2001 From: Justin Law Date: Wed, 9 Oct 2024 11:21:21 -0400 Subject: [PATCH 1/9] nvidia cuda base image and package --- .github/.gitkeep | 0 .github/workflows/build-test.yaml | 28 ++- .github/workflows/publish-image.yaml | 43 ---- .github/workflows/tag-and-release.yaml | 73 +++++-- README.md | 13 +- docker/Dockerfile | 5 - docker/Dockerfile.gpu | 35 +++ docs/GPU.md | 230 ++++++++++++++++++++ release-please-config.json | 7 +- tasks.yaml | 87 ++++++-- tests/cuda-device-query.yaml | 21 ++ tests/cuda-vector-add.yaml | 21 ++ values/nvidia-gpu-operator-values.yaml | 284 +++++++++++++++++++++++++ zarf.yaml | 104 ++++++++- 14 files changed, 841 insertions(+), 110 deletions(-) delete mode 100644 .github/.gitkeep delete mode 100644 .github/workflows/publish-image.yaml delete mode 100644 docker/Dockerfile create mode 100644 docker/Dockerfile.gpu create mode 100644 docs/GPU.md create mode 100644 tests/cuda-device-query.yaml create mode 100644 tests/cuda-vector-add.yaml create mode 100644 values/nvidia-gpu-operator-values.yaml diff --git a/.github/.gitkeep b/.github/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/.github/workflows/build-test.yaml b/.github/workflows/build-test.yaml index 4424c3c..67e7dd2 100644 --- a/.github/workflows/build-test.yaml +++ b/.github/workflows/build-test.yaml @@ -7,35 +7,31 @@ on: - "docs/**" - "CODEOWNERS" -permissions: - id-token: write - contents: read - jobs: test-clean-install: runs-on: ubuntu-latest + + permissions: + id-token: write + contents: read + strategy: matrix: - image: ["rancher/k3s"] - version: ["v1.29.8-k3s1", "v1.30.4-k3s1", "v1.31.0-k3s1"] + version: + ["v1.28.8-k3s1", "v1.29.8-k3s1", "v1.30.4-k3s1", "v1.31.0-k3s1"] steps: - - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4 + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4 - name: Setup UDS - if: always() uses: defenseunicorns/uds-common/.github/actions/setup@e3008473beab00b12a94f9fcc7340124338d5c08 # v0.13.1 with: - username: ${{secrets.IRON_BANK_ROBOT_USERNAME}} - password: ${{secrets.IRON_BANK_ROBOT_PASSWORD}} - - # Step is not currently being used, could be uncommented if custom image support is needed in the future - # - name: Build the custom k3s image - # if: ${{matrix.image}} != "rancher/k3s" - # run: uds run build-image --set VERSION=${{matrix.version}} --no-progress + registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} + registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} + ghToken: ${{ secrets.GITHUB_TOKEN }} - name: Create and deploy the uds-k3d package - run: uds run --set IMAGE_NAME=${{matrix.image}} --set VERSION=${{matrix.version}} --no-progress + run: uds run --set K3S_IMAGE_VERSION=${{matrix.version}} --no-progress - name: Validate uds-k3d package run: uds run validate --no-progress diff --git a/.github/workflows/publish-image.yaml b/.github/workflows/publish-image.yaml deleted file mode 100644 index 64ce092..0000000 --- a/.github/workflows/publish-image.yaml +++ /dev/null @@ -1,43 +0,0 @@ -name: Publish k3s image - -# Workflow is not currently being used, switched to a manual trigger only -on: workflow_dispatch - # push: - # branches: - # - main - # paths: - # - docker/** - # - .github/workflows/publish-image.yaml - -jobs: - publish-k3s-image: - runs-on: ubuntu-latest - strategy: - matrix: - version: ["v1.2.3-k3s1"] # Placeholder - - permissions: - contents: read - packages: write - - steps: - - uses: 
actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4 - - - uses: docker/setup-buildx-action@8026d2bc3645ea78b0d2544766a1225eb5691f89 # v3.7.0 - - - name: Setup UDS - if: always() - uses: defenseunicorns/uds-common/.github/actions/setup@e3008473beab00b12a94f9fcc7340124338d5c08 # v0.13.1 - with: - username: ${{secrets.IRON_BANK_ROBOT_USERNAME}} - password: ${{secrets.IRON_BANK_ROBOT_PASSWORD}} - - - name: Login to GHCR - uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3 - with: - registry: ghcr.io - username: dummy - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Publish the custom k3s image - run: uds run publish-image --set VERSION=${{matrix.version}} --no-progress diff --git a/.github/workflows/tag-and-release.yaml b/.github/workflows/tag-and-release.yaml index d6f2a26..ae0833c 100644 --- a/.github/workflows/tag-and-release.yaml +++ b/.github/workflows/tag-and-release.yaml @@ -7,8 +7,10 @@ on: jobs: tag-new-version: - permissions: write-all runs-on: ubuntu-latest + + permissions: write-all + outputs: release_created: ${{ steps.release-flag.outputs.release_created }} steps: @@ -25,28 +27,73 @@ jobs: if: ${{ needs.tag-new-version.outputs.release_created == 'true'}} runs-on: ubuntu-latest + strategy: + matrix: + k3s_image_repository: ["rancher/k3s"] + k3s_image_version: + ["v1.28.8-k3s1", "v1.29.8-k3s1", "v1.30.4-k3s1", "v1.31.0-k3s1"] + permissions: contents: read packages: write steps: - - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4 + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4 - name: Setup UDS - if: always() uses: defenseunicorns/uds-common/.github/actions/setup@e3008473beab00b12a94f9fcc7340124338d5c08 # v0.13.1 with: - username: ${{secrets.IRON_BANK_ROBOT_USERNAME}} - password: ${{secrets.IRON_BANK_ROBOT_PASSWORD}} + registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} + registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} + ghToken: ${{ secrets.GITHUB_TOKEN }} + + - name: Publish the base capability + run: | + uds zarf package create --confirm -a arm64 -o oci://ghcr.io/defenseunicorns/packages \ + --set K3S_IMAGE_REPOSITORY=${{ matrix.k3s_image_repository }} \ + --set K3S_IMAGE_VERSION=${{ matrix.k3s_image_version }} + + uds zarf package create --confirm -a amd64 -o oci://ghcr.io/defenseunicorns/packages \ + --set K3S_IMAGE_REPOSITORY=${{ matrix.k3s_image_repository }} \ + --set K3S_IMAGE_VERSION=${{ matrix.k3s_image_version }} + + publish-uds-cuda-package: + needs: tag-new-version + if: ${{ needs.tag-new-version.outputs.release_created == 'true'}} + runs-on: ubuntu-latest + + strategy: + matrix: + k3s_image_repository: ["rancher/k3s"] + k3s_image_version: + ["v1.28.8-k3s1", "v1.29.8-k3s1", "v1.30.4-k3s1", "v1.31.0-k3s1"] + cuda_image_version: + [ + 11.8.0-base-ubuntu22.04, + 12.1.0-base-ubuntu22.04, + 12.5.0-base-ubuntu22.04, + ] + + steps: + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4 - - name: Login to GHCR - uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3 + - uses: docker/setup-buildx-action@8026d2bc3645ea78b0d2544766a1225eb5691f89 # v3.7.0 + + - name: Setup UDS + uses: defenseunicorns/uds-common/.github/actions/setup@e3008473beab00b12a94f9fcc7340124338d5c08 # v0.13.1 with: - registry: ghcr.io - username: dummy - password: ${{ secrets.GITHUB_TOKEN }} + registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} + registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} + ghToken: ${{ secrets.GITHUB_TOKEN }} + + - name: Publish the 
CUDA K3s image + run: | + uds run publish-cuda-image \ + --set K3S_IMAGE_REPOSITORY=${{ matrix.k3s_image_repository }} \ + --set K3S_IMAGE_VERSION="${{ matrix.k3s_image_version }}" \ + --set CUDA_IMAGE_VERSION="${{ matrix.cuda_image_version }}" \ + --no-progress - - name: Publish the capability + - name: Publish the CUDA capability run: | - uds zarf package create --confirm -a arm64 -o oci://ghcr.io/defenseunicorns/packages - uds zarf package create --confirm -a amd64 -o oci://ghcr.io/defenseunicorns/packages + uds zarf package create --confirm -a amd64 -o oci://ghcr.io/defenseunicorns/packages --flavor cuda diff --git a/README.md b/README.md index ab4b3d7..a62108b 100644 --- a/README.md +++ b/README.md @@ -16,16 +16,17 @@ sudo ssh -N -L 80:localhost:80 -L 443:localhost:443 -L 6550:localhost:6550 [!NOTE] > UDS K3d intentionally does not address airgap concerns for K3d or the load balancer logic deployed in this package. This allows running `zarf init` or deploying a Zarf Init Package via a UDS Bundle after the UDS K3d environment is deployed. -## Prerequisites +## Pre-Requisites - [UDS CLI](https://github.com/defenseunicorns/uds-cli/blob/main/README.md#install) & [K3d](https://k3d.io/#installation) using the versions specified in the [uds-common repo](https://github.com/defenseunicorns/uds-common/blob/main/README.md#supported-tool-versions) - [Docker](https://docs.docker.com/get-docker/) or [Podman](https://podman.io/getting-started/installation) for running K3d +- See the [GPU Configuration](./docs/GPU.md) information for more details on enabling NVIDIA GPU support within the cluster ## Deploy -`uds zarf package deploy oci://defenseunicorns/uds-k3d:0.9.0` +`uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:0.11.2` @@ -53,13 +54,13 @@ k3d cluster start uds ## Additional Info -You can set extra k3d args by setting the deploy-time ZARF_VAR_K3D_EXTRA_ARGS. See below `zarf-config.yaml` example k3d args: +You can set extra K3d arguments by setting the deploy-time `ZARF_VAR_K3D_EXTRA_ARGS`. 
See the `zarf-config.yaml` example below for sample K3d arguments:

```yaml
package:
  deploy:
    set:
-      k3d_extra_args: "--k3s-arg --gpus=1 --k3s-arg --="
+      k3d_extra_args: --k3s-arg "--=@server:*" --gpus=all
```

### Configure MinIO
@@ -69,3 +70,7 @@ package:
### DNS Assumptions

- [DNS Assumptions](docs/DNS.md)
+
+### Enabling GPU Support
+
+- [GPU Workload Configuration](docs/GPU.md)
diff --git a/docker/Dockerfile b/docker/Dockerfile
deleted file mode 100644
index 8c4797f..0000000
--- a/docker/Dockerfile
+++ /dev/null
@@ -1,5 +0,0 @@
-ARG K3S_TAG="v1.2.3-k3s1" # Placeholder
-
-FROM rancher/k3s:$K3S_TAG as k3s
-
-# Do custom image things
diff --git a/docker/Dockerfile.gpu b/docker/Dockerfile.gpu
new file mode 100644
index 0000000..1e7339b
--- /dev/null
+++ b/docker/Dockerfile.gpu
@@ -0,0 +1,35 @@
+ARG K3S_REPOSITORY="rancher/k3s"
+ARG K3S_TAG="v1.30.4-k3s1"
+ARG CUDA_TAG="12.1.0-base-ubuntu22.04"
+
+FROM $K3S_REPOSITORY:$K3S_TAG AS k3s
+
+FROM nvidia/cuda:$CUDA_TAG
+
+# Install the NVIDIA container toolkit
+RUN apt-get update && \
+    apt-get install -y curl && \
+    curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg && \
+    curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
+    sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
+    tee /etc/apt/sources.list.d/nvidia-container-toolkit.list && \
+    apt-get update && \
+    apt-get install -y nvidia-container-toolkit-base nvidia-container-toolkit nvidia-container-runtime util-linux && \
+    nvidia-ctk runtime configure --runtime=containerd
+
+COPY --from=k3s / / --exclude=/bin/
+COPY --from=k3s /bin /bin
+
+VOLUME /var/lib/kubelet
+VOLUME /var/lib/rancher/k3s
+VOLUME /var/lib/cni
+VOLUME /var/log
+
+# Resolve fsnotify issues
+RUN sysctl -w fs.inotify.max_user_watches=100000 && \
+    sysctl -w fs.inotify.max_user_instances=100000
+
+ENV PATH="$PATH:/bin/aux"
+
+ENTRYPOINT ["/bin/k3s"]
+CMD ["agent"]
diff --git a/docs/GPU.md b/docs/GPU.md
new file mode 100644
index 0000000..8fabfa5
--- /dev/null
+++ b/docs/GPU.md
@@ -0,0 +1,230 @@
+# GPU
+
+UDS K3d comes with optional base images that provide GPU scheduling in the cluster to allow for GPU-accelerated workloads (e.g., LLMs). Currently, UDS K3d only supports NVIDIA CUDA-capable GPUs, with considerations for supporting AMD and other workloads in the future.
+
+## NVIDIA
+
+### Pre-Requisites
+
+### NVIDIA Drivers
+
+- Ensure that the proper [NVIDIA drivers](https://www.nvidia.com/download/index.aspx) are installed (>=525.60).
+- Follow the [driver download](https://www.nvidia.com/download/index.aspx) by identifying your hardware from the provided list.
+
+### NVIDIA Container Toolkit
+
+- [Read the pre-requisites for installation and follow the instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-apt) to download and install the NVIDIA container toolkit (>=1.14).
+- After the successful installation off the toolkit, follow the [toolkit instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#configuring-docker) to verify that your default Docker runtime is configured for NVIDIA: + + ```bash + nvidia-ctk runtime configure --runtime=docker --config=$HOME/.config/docker/daemon.json + ``` + +- Verify that `nvidia` is now a runtime available to the Docker daemon to use: + + ```bash + # the expected output should be similar to: `Runtimes: io.containerd.runc.v2 nvidia runc` + docker info | grep -i nvidia + ``` + +- [Try out a sample CUDA workload](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/sample-workload.html) to ensure your Docker containers have access to the GPUs after configuration. +- (OPTIONAL) You can configure Docker to use the `nvidia` runtime by default by adding the `--set-as-default` flag during the container toolkit post-installation configuration step by running the following command: + + ```bash + nvidia-ctk runtime configure --runtime=docker --config=$HOME/.config/docker/daemon.json --set-as-default + ``` + +- (OPTIONAL) Verify that the default runtime is changed by running the following command: + + ```bash + # the expected output should be similar to: `Default Runtime: nvidia` + docker info | grep "Default Runtime" + ``` + +### Usage + +#### Local Build and Deployment + +To use the NVIDIA CUDA K3s image when bootstrapping a UDS K3d cluster, execute the following: + +```bash +uds run default-cuda +``` + +#### Remote Package Deployment + +To use the NVIDIA CUDA K3s image when bootstrapping a UDS K3d cluster, execute the following: + + + +```bash +export PACKAGE_VERSION=0.11.2 +uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:${PACKAGE_VERSION}-cuda --confirm +``` + + + +##### Additional Base Images + +This repository publishes several variations of the underlying K3d image and CUDA image so that it covers more compatibility cases (e.g., GPU driver versions, K3d versions, etc.). Please see the [published images](https://github.com/defenseunicorns/uds-k3d/pkgs/container/uds-k3d%2Fcuda-k3s) for all possible variations. + +Below are some examples of setting these variables to choose a different variation at deploy-time: + +```bash +uds run default-cuda --set K3S_IMAGE_VERSION="v1.29.8-k3s1" --set CUDA_IMAGE_VERSION="12.1.0-base-ubuntu22.04" +# OR +uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:${PACKAGE_VERSION}-cuda --confirm --set K3S_IMAGE_VERSION="v1.31.0-k3s1" --set CUDA_IMAGE_VERSION="12.5.0-base-ubuntu22.04" +# OR +uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:${PACKAGE_VERSION}-cuda --confirm --set K3S_IMAGE_VERSION="v1.29.8-k3s1" --set CUDA_IMAGE_VERSION="11.8.0-base-ubuntu22.04" +``` + +#### Tests + +This repository includes two CUDA workload tests that can be executed: + +```bash +uds run validate-cuda # device info query +uds run validate-cuda --set CUDA_TEST="cuda-vector-add" # vector addition +``` + +### Troubleshooting + +#### NVML Errors or Missing CUDA Dependencies + +None of the following should ever error or return `unknown version`: + +1. Check if your NVIDIA GPU drivers are installed: + + ```bash + nvidia-smi + ``` + +2. Check the version of your NVIDIA Container Toolkit: + + ```bash + nvidia-ctk --version + ``` + +3. 
Look at your Docker runtime information and make sure the following returns several lines of information:
+
+   ```bash
+   docker info | grep "nvidia"
+   ```
+
+4. Try running a CUDA sample test in the cluster: [CUDA Vector Add](https://github.com/NVIDIA/k8s-device-plugin/blob/a6a7ce12d28618d343c251ca0941222d7b8a46d3/README.md?plain=1#L145).
+
+#### Memory Errors or Process Locks
+
+If you are not deploying a fresh cluster or fresh packages (e.g., a GPU workload is already deployed), or you have a GPU that has other workloads on it (e.g., display), then there may not be enough resources to offload the workloads to the NVIDIA GPU.
+
+1. To see what host-level processes are on your NVIDIA GPU(s), run the following:
+
+   ```bash
+   nvidia-smi
+   ```
+
+2. To check which pods are scheduled with GPU resources in particular, you can run the following `uds zarf tools yq` command:
+
+   ```bash
+   uds zarf tools kubectl get pods \
+     --all-namespaces \
+     --output=yaml \
+     | uds zarf tools yq eval -o=json '
+       ["Pod", "Namespace", "Container", "GPU"] as $header |
+       [$header] + [
+         .items[] |
+         .metadata as $metadata |
+         .spec.containers[] |
+         select(.resources.requests["nvidia.com/gpu"]) |
+         [
+           $metadata.name,
+           $metadata.namespace,
+           .name,
+           .resources.requests["nvidia.com/gpu"]
+         ]
+       ]' - \
+     | uds zarf tools yq -r '(.[0] | @tsv), (.[1:][] | @tsv)' \
+     | column -t -s $'\t'
+   ```
+
+When you reinstall or start a new GPU-dependent pod, the previous PID (process) on the GPU may not have been flushed yet.
+
+1. Scale the previous GPU-dependent pod deployment down to 0, as the current `RollingUpdate` strategy for vLLM relies on back-up/secondary GPUs to be available for a graceful turnover
+2. Use `nvidia-smi` to check if the process has been flushed upon Pod termination BEFORE you deploy a new GPU-dependent pod, and if not, use `kill -9 <PID>` to manually flush the process
+
+#### MacOS
+
+UDS K3d's NVIDIA GPU support does not work on MacOS.
+
+#### Windows (WSL2)
+
+The NVIDIA GPU Operator does not work on WSL2 as of version v24.3.0 (see [issue](https://github.com/NVIDIA/gpu-operator/issues/318)); however, the NVIDIA Device Plugin, by itself, does work as of version 0.15.0-rc1 (see [comment](https://github.com/NVIDIA/k8s-device-plugin/issues/332#issuecomment-1927997436)).
+
+To get around this issue, the recommended course of action is to install UDS K3d without the `cuda` flavor, and then deploy the NVIDIA Device Plugin separately. Below are the steps for doing so:
+
+1. Run `uds run default --set K3D_EXTRA_ARGS="--gpus=all"` or `uds zarf package deploy oci://defenseunicorns/uds-k3d:${PACKAGE_VERSION} --confirm --set K3D_EXTRA_ARGS="--gpus=all"`
+2. 
Create an `nvidia-device-plugin.yaml` manifest like the one below, and a deploy it with `uds zarf tools kubectl apply -f nvidia-device-plugin.yaml` + + ```yaml + apiVersion: node.k8s.io/v1 + kind: RuntimeClass + metadata: + name: nvidia + handler: nvidia + --- + apiVersion: apps/v1 + kind: DaemonSet + metadata: + name: nvidia-device-plugin-daemonset + namespace: kube-system + spec: + selector: + matchLabels: + name: nvidia-device-plugin-daemonset + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + name: nvidia-device-plugin-daemonset + spec: + runtimeClassName: nvidia # Explicitly request the runtime + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + # Mark this pod as a critical add-on; when enabled, the critical add-on + # scheduler reserves resources for critical add-on pods so that they can + # be rescheduled after a failure. + # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ + priorityClassName: "system-node-critical" + containers: + - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2 + name: nvidia-device-plugin-ctr + env: + - name: PASS_DEVICE_SPECS + value: "true" + - name: FAIL_ON_INIT_ERROR + value: "true" + - name: DEVICE_LIST_STRATEGY + value: envvar + - name: DEVICE_ID_STRATEGY + value: uuid + - name: NVIDIA_VISIBLE_DEVICES + value: all + - name: NVIDIA_DRIVER_CAPABILITIES + value: compute,utility + - name: MPS_ROOT + value: /run/nvidia/mps + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + volumes: + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins + ``` diff --git a/release-please-config.json b/release-please-config.json index 85c883d..97e8e28 100644 --- a/release-please-config.json +++ b/release-please-config.json @@ -10,7 +10,12 @@ { "type": "chore", "section": "Miscellaneous", "hidden": false } ], "versioning": "default", - "extra-files": ["README.md", "zarf.yaml", "chart/Chart.yaml"] + "extra-files": [ + "README.md", + "zarf.yaml", + "chart/Chart.yaml", + "docs/GPU.md" + ] } } } diff --git a/tasks.yaml b/tasks.yaml index 00fb80c..3269a5d 100644 --- a/tasks.yaml +++ b/tasks.yaml @@ -1,38 +1,63 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/defenseunicorns/uds-cli/v0.16.0/tasks.schema.json + variables: - - name: VERSION - default: "v1.30.4-k3s1" - - name: IMAGE_NAME + - name: K3S_IMAGE_REPOSITORY default: "rancher/k3s" + - name: K3S_IMAGE_VERSION + default: "v1.30.4-k3s1" + - name: CUDA_IMAGE_VERSION + default: "12.1.0-base-ubuntu22.04" - name: K3D_EXTRA_ARGS default: "" - name: NGINX_EXTRA_PORTS default: "[]" + - name: CUDA_TEST + default: "cuda-device-query" tasks: - name: default description: "Build and deploy uds-k3d" actions: - - description: "Build UDS K3d package" - cmd: "uds zarf package create --confirm --no-progress" + - description: "Build the uds-k3d package" + cmd: "rm -rf zarf-package-uds-k3d-*.tar.zst && uds zarf package create --confirm --no-progress" - - description: "Deploy UDS K3d package" + - description: "Deploy the uds-k3d package" cmd: | uds zarf package deploy zarf-package-uds-k3d-*.tar.zst \ - --set K3D_IMAGE=${IMAGE_NAME}:${VERSION} \ + --set K3S_IMAGE_REPOSITORY=${K3S_IMAGE_REPOSITORY} \ + --set K3S_IMAGE_VERSION=${K3S_IMAGE_VERSION} \ + --set K3D_EXTRA_ARGS="${K3D_EXTRA_ARGS}" \ + --set NGINX_EXTRA_PORTS="${NGINX_EXTRA_PORTS}" \ + --no-progress --confirm + + - name: 
default-cuda + description: "Build and deploy uds-k3d with CUDA support" + actions: + - description: "Build the uds-k3d CUDA package" + cmd: "rm -rf zarf-package-uds-k3d-*.tar.zst && uds zarf package create --flavor cuda --confirm --no-progress" + + - description: "Build the k3s-cuda image locally" + task: build-cuda-image + + - description: "Deploy the uds-k3d CUDA package" + cmd: | + uds zarf package deploy zarf-package-uds-k3d-*.tar.zst \ + --set K3S_IMAGE_VERSION=${K3S_IMAGE_VERSION} \ + --set CUDA_IMAGE_VERSION=${CUDA_IMAGE_VERSION} \ --set K3D_EXTRA_ARGS="${K3D_EXTRA_ARGS}" \ --set NGINX_EXTRA_PORTS="${NGINX_EXTRA_PORTS}" \ --no-progress --confirm - name: validate actions: - - description: Validate coredns is up + - description: "Validate CoreDNS is up" wait: cluster: kind: Pod name: "k8s-app=kube-dns" namespace: kube-system condition: Ready - - description: Validate coredns is resolving *.uds.dev internally + - description: "Validate CoreDNS is resolving *.uds.dev internally" cmd: | set -e FOO_IP=$(uds zarf tools kubectl run dig-test --image=arunvelsriram/utils -q --restart=Never --rm -i -- dig +short foo.uds.dev) @@ -42,19 +67,45 @@ tasks: echo "CoreDNS patch failed, foo.uds.dev is resolving to 127.0.0.1" exit 1 fi - - description: Validate zarf init + - description: "Validate zarf init" cmd: | set -e uds zarf tools download-init --no-progress # Test zarf init due to containerd issue - https://github.com/defenseunicorns/zarf/issues/592 uds zarf init --confirm --no-progress - # - name: build-image - # actions: - # - description: Build the custom k3s image - # cmd: docker build -t ${IMAGE_NAME}:${VERSION} --build-arg K3S_TAG=${VERSION} docker/ + - name: validate-cuda + description: "Run a CUDA test workload in-cluster" + actions: + - description: "Deploy the device query test pod to the cluster" + cmd: | + uds zarf tools kubectl apply -f tests/${CUDA_TEST}.yaml + - description: "Await test completion and then display the test results" + cmd: | + uds zarf tools wait-for Pod cuda-test-pod '{.status.phase}'=Succeeded -n default --no-progress --timeout 120s + uds zarf tools kubectl logs -l app=cuda-test-pod -n default + - description: "Remove the completed test pod" + cmd: | + uds zarf tools kubectl delete Pod cuda-test-pod + + - name: build-cuda-image + actions: + - description: "Build the CUDA K3s image" + cmd: | + docker build -t ghcr.io/defenseunicorns/uds-k3d/cuda-k3s:${K3S_IMAGE_VERSION}-cuda-${CUDA_IMAGE_VERSION} \ + --build-arg K3S_REPOSITORY=${K3S_IMAGE_REPOSITORY} \ + --build-arg K3S_TAG=${K3S_IMAGE_VERSION} \ + --build-arg CUDA_TAG=${CUDA_IMAGE_VERSION} \ + -f docker/Dockerfile.gpu . -q - # - name: publish-image - # actions: - # - description: Publish the custom k3s image - # cmd: docker buildx build --push --platform linux/arm64/v8,linux/amd64 --tag ${IMAGE_NAME}:${VERSION} docker + - name: publish-cuda-image + actions: + - description: "Publish the CUDA K3s image" + cmd: | + docker buildx build --push \ + --platform linux/amd64 \ + --build-arg K3S_REPOSITORY=${K3S_IMAGE_REPOSITORY} \ + --build-arg K3S_TAG=${K3S_IMAGE_VERSION} \ + --build-arg CUDA_TAG=${CUDA_IMAGE_VERSION} \ + -t ghcr.io/defenseunicorns/uds-k3d/cuda-k3s:${K3S_IMAGE_VERSION}-cuda-${CUDA_IMAGE_VERSION} \ + -f docker/Dockerfile.gpu . 
-q diff --git a/tests/cuda-device-query.yaml b/tests/cuda-device-query.yaml new file mode 100644 index 0000000..b1b95da --- /dev/null +++ b/tests/cuda-device-query.yaml @@ -0,0 +1,21 @@ +apiVersion: v1 +kind: Pod +metadata: + name: cuda-test-pod + labels: + app: cuda-test-pod +spec: + runtimeClassName: nvidia + restartPolicy: Never + containers: + - name: cuda-container + image: nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0-ubuntu22.04 + resources: + limits: + nvidia.com/gpu: "1" # requesting 1 GPU + cpu: "1" + memory: 0.5Gi + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule diff --git a/tests/cuda-vector-add.yaml b/tests/cuda-vector-add.yaml new file mode 100644 index 0000000..a62ba98 --- /dev/null +++ b/tests/cuda-vector-add.yaml @@ -0,0 +1,21 @@ +apiVersion: v1 +kind: Pod +metadata: + name: cuda-test-pod + labels: + app: cuda-test-pod +spec: + runtimeClassName: nvidia + restartPolicy: Never + containers: + - name: cuda-container + image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0-ubuntu22.04 + resources: + limits: + nvidia.com/gpu: "1" # requesting 1 GPU + cpu: "1" + memory: 0.5Gi + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule diff --git a/values/nvidia-gpu-operator-values.yaml b/values/nvidia-gpu-operator-values.yaml new file mode 100644 index 0000000..a35fe71 --- /dev/null +++ b/values/nvidia-gpu-operator-values.yaml @@ -0,0 +1,284 @@ +# See the NVIDIA GPU Operator repository for more details on available values +# https://github.com/NVIDIA/gpu-operator/blob/main/deployments/gpu-operator/values.yaml + +platform: + openshift: false + +nfd: + # usually enabled by default, but choose to use external NFD from IronBank + enabled: true + nodefeaturerules: false + +psa: + enabled: false + +cdi: + enabled: false + +sandboxWorkloads: + enabled: false + +hostPaths: + # rootFS represents the path to the root filesystem of the host. + # This is used by components that need to interact with the host filesystem + # and as such this must be a chroot-able filesystem. + # Examples include the MIG Manager and Toolkit Container which may need to + # stop, start, or restart systemd services + rootFS: "/" + + # driverInstallDir represents the root at which driver files including libraries, + # config files, and executables can be found. + driverInstallDir: "/run/nvidia/driver" + +daemonsets: + labels: {} + annotations: {} + priorityClassName: system-node-critical + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + # configuration for controlling update strategy("OnDelete" or "RollingUpdate") of GPU Operands + # note that driver Daemonset is always set with OnDelete to avoid unintended disruptions + updateStrategy: "RollingUpdate" + # configuration for controlling rolling update of GPU Operands + rollingUpdate: + # maximum number of nodes to simultaneously apply pod updates on. + # can be specified either as number or percentage of nodes. Default 1. 
+ maxUnavailable: "1" + +validator: + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: [] + args: [] + resources: {} + plugin: + env: + - name: WITH_WORKLOAD + value: "false" + driver: + env: + - name: DISABLE_DEV_CHAR_SYMLINK_CREATION + value: "true" + - name: NVIDIA_VISIBLE_DEVICES + value: all + # Default value of "all" causes the "display" capability to also be considered; + # however, not all hosts have or allow that capability, causing the daemonset to fail + - name: NVIDIA_DRIVER_CAPABILITIES + value: compute,utility + +operator: + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + priorityClassName: system-node-critical + # Can be set to `containerd`, `docker`, etc. + defaultRuntime: docker + runtimeClass: nvidia + use_ocp_driver_toolkit: false + # cleanup CRD on chart un-install + cleanupCRD: false + # upgrade CRD on chart upgrade, requires --disable-openapi-validation flag + # to be passed during helm upgrade. + upgradeCRD: false + initContainer: + image: cuda + + tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Equal" + value: "" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/control-plane" + operator: "Equal" + value: "" + effect: "NoSchedule" + annotations: + openshift.io/scc: restricted-readonly + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + preference: + matchExpressions: + - key: "node-role.kubernetes.io/master" + operator: In + values: [""] + - weight: 1 + preference: + matchExpressions: + - key: "node-role.kubernetes.io/control-plane" + operator: In + values: [""] + logging: + # Zap time encoding (one of 'epoch', 'millis', 'nano', 'iso8601', 'rfc3339' or 'rfc3339nano') + timeEncoding: epoch + # Zap Level to configure the verbosity of logging. 
Can be one of 'debug', 'info', 'error', or any integer value > 0 which corresponds to custom debug levels of increasing verbosity + level: info + # Development Mode defaults(encoder=consoleEncoder,logLevel=Debug,stackTraceLevel=Warn) + # Production Mode defaults(encoder=jsonEncoder,logLevel=Info,stackTraceLevel=Error) + develMode: true + resources: + limits: + cpu: 500m + memory: 350Mi + requests: + cpu: 200m + memory: 100Mi + +mig: + strategy: single + +driver: + # usually enabled by default, depends on deployment environment + enabled: false + +toolkit: + # usually enabled by default, depends on deployment environment + enabled: false + +devicePlugin: + enabled: true + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + args: [] + env: + - name: PASS_DEVICE_SPECS + value: "true" + - name: FAIL_ON_INIT_ERROR + value: "true" + - name: DEVICE_LIST_STRATEGY + value: envvar + - name: DEVICE_ID_STRATEGY + value: uuid + - name: NVIDIA_VISIBLE_DEVICES + value: all + # Default value of "all" causes the "display" capability to also be considered; + # however, not all hosts have or allow that capability, causing the daemonset to fail + - name: NVIDIA_DRIVER_CAPABILITIES + value: compute,utility + resources: {} + config: + # Create a ConfigMap (default: false) + create: false + # ConfigMap name (either exiting or to create a new one with create=true above) + name: "" + # Default config name within the ConfigMap + default: "" + # Data section for the ConfigMap to create (i.e only applies when create=true) + data: {} + # MPS related configuration for the plugin + mps: + # MPS root path on the host + root: "/run/nvidia/mps" + +# standalone dcgm host engine +dcgm: + # disabled by default to use embedded nv-host engine by exporter + enabled: false + +dcgmExporter: + enabled: true + imagePullPolicy: IfNotPresent + env: + - name: DCGM_EXPORTER_LISTEN + value: ":9400" + - name: DCGM_EXPORTER_KUBERNETES + value: "true" + - name: DCGM_EXPORTER_COLLECTORS + value: "/etc/dcgm-exporter/dcp-metrics-included.csv" + resources: {} + serviceMonitor: + enabled: false + interval: 15s + honorLabels: false + additionalLabels: {} + relabelings: [] + +gfd: + enabled: true + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: + - name: GFD_SLEEP_INTERVAL + value: 60s + - name: GFD_FAIL_ON_INIT_ERROR + value: "true" + resources: {} + +migManager: + # usually enabled by default, depends on deployment environment + enabled: false + +nodeStatusExporter: + enabled: false + +gds: + enabled: false + +gdrcopy: + enabled: false + +vgpuManager: + enabled: false + +vgpuDeviceManager: + # usually enabled by default, depends on deployment environment + enabled: false + +vfioManager: + # usually enabled by default, depends on deployment environment + enabled: false + +kataManager: + enabled: false + +sandboxDevicePlugin: + # usually enabled by default, depends on deployment environment + enabled: false + +ccManager: + enabled: false + +node-feature-discovery: + enableNodeFeatureApi: true + gc: + enable: true + replicaCount: 1 + serviceAccount: + name: node-feature-discovery + create: false + worker: + serviceAccount: + name: node-feature-discovery + # disable creation to avoid duplicate serviceaccount creation by master spec below + create: false + tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Equal" + value: "" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/control-plane" + operator: "Equal" + value: "" + effect: "NoSchedule" + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + 
config: + sources: + pci: + deviceClassWhitelist: + - "02" + - "0200" + - "0207" + - "0300" + - "0302" + deviceLabelFields: + - vendor + master: + serviceAccount: + name: node-feature-discovery + create: true + config: + extraLabelNs: ["nvidia.com"] diff --git a/zarf.yaml b/zarf.yaml index 9a9d19f..2abfdca 100644 --- a/zarf.yaml +++ b/zarf.yaml @@ -4,7 +4,7 @@ kind: ZarfPackageConfig metadata: name: uds-k3d description: "UDS K3d Cluster Setup. WARNING: This will destroy the cluster if it already exists." - url: https://github.com/defenseunicorns/uds-k3d + url: https://github.com/justinthelaw/uds-k3d yolo: true # x-release-please-start-version version: "0.9.0" @@ -15,9 +15,21 @@ variables: description: "Name of the cluster" default: "uds" - - name: K3D_IMAGE - description: "K3d image to use" - default: "rancher/k3s:v1.30.4-k3s1" + - name: K3S_IMAGE_REPOSITORY + description: "K3s image repository to use" + default: "rancher/k3s" + + - name: K3S_IMAGE_VERSION + description: "K3d image version to use" + default: "v1.30.4-k3s1" + + - name: CUDA_IMAGE_VERSION + description: "CUDA image to use" + default: "12.1.0-base-ubuntu22.04" + + - name: NUMBER_OF_GPUS + description: "Number of GPUs to passthrough to the K3D cluster" + default: "all" - name: K3D_EXTRA_ARGS description: "Optionally pass k3d arguments to the default" @@ -37,6 +49,43 @@ components: - cmd: k3d cluster delete ${ZARF_VAR_CLUSTER_NAME} description: "Destroy the cluster" + - name: set-k3d-image + required: true + description: "Set the K3s base image" + actions: + onDeploy: + before: + - cmd: | + echo "${ZARF_VAR_K3S_IMAGE_REPOSITORY}:${ZARF_VAR_K3S_IMAGE_VERSION}" + setVariables: + - name: K3D_IMAGE + + - name: inject-cuda-image + required: true + only: + flavor: cuda + description: "Overwrites the K3s base image variable to be the CUDA K3s image" + actions: + onDeploy: + before: + - cmd: | + echo "ghcr.io/justinthelaw/uds-k3d/cuda-k3s:${ZARF_VAR_K3S_IMAGE_VERSION}-cuda-${ZARF_VAR_CUDA_IMAGE_VERSION}" + setVariables: + - name: K3D_IMAGE + + - name: expose-gpus + required: true + only: + flavor: cuda + description: "Adds the extra K3d argument for exposing host GPUs to the cluster" + actions: + onDeploy: + before: + - cmd: | + echo "${ZARF_VAR_K3D_EXTRA_ARGS} --gpus=${ZARF_VAR_NUMBER_OF_GPUS}" + setVariables: + - name: K3D_EXTRA_ARGS + - name: create-cluster required: true description: "Create the k3d cluster" @@ -52,8 +101,9 @@ components: --k3s-arg "--disable=metrics-server@server:*" \ --k3s-arg "--disable=servicelb@server:*" \ --k3s-arg "--disable=local-storage@server:*" \ - --image ${ZARF_VAR_K3D_IMAGE} ${ZARF_VAR_K3D_EXTRA_ARGS} \ - ${ZARF_VAR_CLUSTER_NAME} + ${ZARF_VAR_K3D_EXTRA_ARGS} \ + --image ${ZARF_VAR_K3D_IMAGE} \ + "${ZARF_VAR_CLUSTER_NAME}" description: "Create the cluster" onSuccess: - cmd: | @@ -73,12 +123,12 @@ components: actions: onDeploy: before: - - cmd: ./zarf tools kubectl get nodes -o=jsonpath='{.items[0].status.addresses[?(@.type=="InternalIP")].address}' | cut -d'.' -f1-3 - description: "Load network ip base for MetalLB" + - cmd: uds zarf tools kubectl get nodes -o=jsonpath='{.items[0].status.addresses[?(@.type=="InternalIP")].address}' | cut -d'.' 
-f1-3 + description: "Load network base IP for MetalLB" setVariables: - name: BASE_IP after: - - cmd: ./zarf tools kubectl rollout restart deployment coredns -n kube-system + - cmd: uds zarf tools kubectl rollout restart deployment coredns -n kube-system description: "Restart CoreDNS to pick up internal DNS override for uds.dev" charts: - name: metallb @@ -89,7 +139,7 @@ components: namespace: uds-dev-stack localPath: chart # x-release-please-start-version - version: 0.9.0 + version: 0.11.2 # x-release-please-end valuesFiles: - "values/dev-stack-values.yaml" @@ -104,3 +154,37 @@ components: url: https://charts.min.io/ valuesFiles: - "values/minio-values.yaml" + + - name: nvidia-gpu-operator + description: "Install the NVIDIA GPU Operator for CUDA-enabled clusters" + only: + flavor: cuda + required: true + charts: + - name: gpu-operator + url: https://helm.ngc.nvidia.com/nvidia + version: v24.3.0 + namespace: kube-system + valuesFiles: + - "values/nvidia-gpu-operator-values.yaml" + actions: + onDeploy: + after: + - description: "Validate nvidia-device-plugin-daemonset is up" + wait: + cluster: + kind: Pod + name: app=nvidia-device-plugin-daemonset + namespace: kube-system + # Ensure the device plugin is healthy, which might take a while depending on the machine + condition: "'{.status.conditions[2].status}'=True" + maxTotalSeconds: 600 + - description: "Validate nvidia-operator-validator is completed" + wait: + cluster: + kind: Pod + name: app=nvidia-operator-validator + namespace: kube-system + # Ensure the NVIDIA host validator job succeeds + condition: "'{.status.conditions[2].status}'=True" + maxTotalSeconds: 300 From ca327997220cbfe2aa7f7f1a8eb390145e89d145 Mon Sep 17 00:00:00 2001 From: Justin Law Date: Wed, 9 Oct 2024 11:22:32 -0400 Subject: [PATCH 2/9] remove incorrect org repo --- tasks.yaml | 4 ++-- zarf.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tasks.yaml b/tasks.yaml index 3269a5d..35e0987 100644 --- a/tasks.yaml +++ b/tasks.yaml @@ -90,7 +90,7 @@ tasks: - name: build-cuda-image actions: - - description: "Build the CUDA K3s image" + - description: "Build the k3s-cuda image" cmd: | docker build -t ghcr.io/defenseunicorns/uds-k3d/cuda-k3s:${K3S_IMAGE_VERSION}-cuda-${CUDA_IMAGE_VERSION} \ --build-arg K3S_REPOSITORY=${K3S_IMAGE_REPOSITORY} \ @@ -100,7 +100,7 @@ tasks: - name: publish-cuda-image actions: - - description: "Publish the CUDA K3s image" + - description: "Publish the k3s-cuda image" cmd: | docker buildx build --push \ --platform linux/amd64 \ diff --git a/zarf.yaml b/zarf.yaml index 2abfdca..0761026 100644 --- a/zarf.yaml +++ b/zarf.yaml @@ -4,7 +4,7 @@ kind: ZarfPackageConfig metadata: name: uds-k3d description: "UDS K3d Cluster Setup. WARNING: This will destroy the cluster if it already exists." 
- url: https://github.com/justinthelaw/uds-k3d + url: https://github.com/defenseunicorns/uds-k3d yolo: true # x-release-please-start-version version: "0.9.0" @@ -69,7 +69,7 @@ components: onDeploy: before: - cmd: | - echo "ghcr.io/justinthelaw/uds-k3d/cuda-k3s:${ZARF_VAR_K3S_IMAGE_VERSION}-cuda-${ZARF_VAR_CUDA_IMAGE_VERSION}" + echo "ghcr.io/defenseunicorns/uds-k3d/cuda-k3s:${ZARF_VAR_K3S_IMAGE_VERSION}-cuda-${ZARF_VAR_CUDA_IMAGE_VERSION}" setVariables: - name: K3D_IMAGE From d8c6637d1083d61b3741ac9b81662f0cd12ac54b Mon Sep 17 00:00:00 2001 From: Justin Law Date: Wed, 9 Oct 2024 11:24:53 -0400 Subject: [PATCH 3/9] generalize test description name --- tasks.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks.yaml b/tasks.yaml index 35e0987..732c10f 100644 --- a/tasks.yaml +++ b/tasks.yaml @@ -77,7 +77,7 @@ tasks: - name: validate-cuda description: "Run a CUDA test workload in-cluster" actions: - - description: "Deploy the device query test pod to the cluster" + - description: "Deploy the test pod to the cluster" cmd: | uds zarf tools kubectl apply -f tests/${CUDA_TEST}.yaml - description: "Await test completion and then display the test results" From cec1c4e5fb947d9d27fad9598238acf5b49cff6f Mon Sep 17 00:00:00 2001 From: Justin Law Date: Wed, 9 Oct 2024 11:43:57 -0400 Subject: [PATCH 4/9] align build-test matrix with publishing one --- .github/workflows/build-test.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-test.yaml b/.github/workflows/build-test.yaml index 67e7dd2..93ae97c 100644 --- a/.github/workflows/build-test.yaml +++ b/.github/workflows/build-test.yaml @@ -17,7 +17,8 @@ jobs: strategy: matrix: - version: + k3s_image_repository: ["rancher/k3s"] + k3s_image_version: ["v1.28.8-k3s1", "v1.29.8-k3s1", "v1.30.4-k3s1", "v1.31.0-k3s1"] steps: @@ -31,7 +32,7 @@ jobs: ghToken: ${{ secrets.GITHUB_TOKEN }} - name: Create and deploy the uds-k3d package - run: uds run --set K3S_IMAGE_VERSION=${{matrix.version}} --no-progress + run: uds run --set K3S_IMAGE_VERSION=${{matrix.k3s_image_version}} K3S_IMAGE_REPOSITORY=${{matrix.k3s_image_repository}} --no-progress - name: Validate uds-k3d package run: uds run validate --no-progress From ae9d9b19cda791c521fe4ce3cd8ee4a2b8d65cee Mon Sep 17 00:00:00 2001 From: Justin Law Date: Wed, 9 Oct 2024 11:45:52 -0400 Subject: [PATCH 5/9] fix --set typo --- .github/workflows/build-test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-test.yaml b/.github/workflows/build-test.yaml index 93ae97c..b3e29a1 100644 --- a/.github/workflows/build-test.yaml +++ b/.github/workflows/build-test.yaml @@ -32,7 +32,7 @@ jobs: ghToken: ${{ secrets.GITHUB_TOKEN }} - name: Create and deploy the uds-k3d package - run: uds run --set K3S_IMAGE_VERSION=${{matrix.k3s_image_version}} K3S_IMAGE_REPOSITORY=${{matrix.k3s_image_repository}} --no-progress + run: uds run --set K3S_IMAGE_VERSION=${{matrix.k3s_image_version}} --set K3S_IMAGE_REPOSITORY=${{matrix.k3s_image_repository}} --no-progress - name: Validate uds-k3d package run: uds run validate --no-progress From 6a8081f568fc379fe975ef8d64b23baad5df0972 Mon Sep 17 00:00:00 2001 From: Justin Law Date: Wed, 9 Oct 2024 11:48:00 -0400 Subject: [PATCH 6/9] remove erroneous version, go to 0.9.0 --- README.md | 2 +- docs/GPU.md | 2 +- zarf.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a62108b..ec7f236 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ sudo ssh 
-N -L 80:localhost:80 -L 443:localhost:443 -L 6550:localhost:6550 -`uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:0.11.2` +`uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:0.9.0` diff --git a/docs/GPU.md b/docs/GPU.md index 8fabfa5..5a81351 100644 --- a/docs/GPU.md +++ b/docs/GPU.md @@ -58,7 +58,7 @@ To use the NVIDIA CUDA K3s image when bootstrapping a UDS K3d cluster, execute t ```bash -export PACKAGE_VERSION=0.11.2 +export PACKAGE_VERSION=0.9.0 uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:${PACKAGE_VERSION}-cuda --confirm ``` diff --git a/zarf.yaml b/zarf.yaml index 0761026..5a61989 100644 --- a/zarf.yaml +++ b/zarf.yaml @@ -139,7 +139,7 @@ components: namespace: uds-dev-stack localPath: chart # x-release-please-start-version - version: 0.11.2 + version: 0.9.0 # x-release-please-end valuesFiles: - "values/dev-stack-values.yaml" From 0dc3f1450a055ab2b93cd62345e94c1064693116 Mon Sep 17 00:00:00 2001 From: Justin Law <81255462+justinthelaw@users.noreply.github.com> Date: Wed, 9 Oct 2024 12:02:29 -0400 Subject: [PATCH 7/9] Update GPU.md --- docs/GPU.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/GPU.md b/docs/GPU.md index 5a81351..4b8b203 100644 --- a/docs/GPU.md +++ b/docs/GPU.md @@ -160,7 +160,7 @@ UDS K3d's NVIDIA GPU support does not work on MacOS. The NVIDIA GPU Operator does not work on WSL2 as of version v24.3.0 (see [issue](https://github.com/NVIDIA/gpu-operator/issues/318)); however, the NVIDIA Device Plugin, by itself, does work as of version 0.15.0-rc1 (see [comment](https://github.com/NVIDIA/k8s-device-plugin/issues/332#issuecomment-1927997436)). -To get around this issue, the recommended course of action is to install UDS K3d without the `cuda` flavor, and then deploy the NVIDIA Device Plugin separately. Below are the steps for doing so: +To get around this issue, the recommended course of action is to install UDS K3d with the `cuda` flavor, delete the NVIDIA GPU Operator deployment, and then deploy the NVIDIA Device Plugin separately. Below are the steps for doing so: 1. Run `uds run default --set K3D_EXTRA_ARGS="--gpus=all"` or `uds zarf package deploy oci://defenseunicorns/uds-k3d:${PACKAGE_VERSION} --confirm --set K3D_EXTRA_ARGS="--gpus=all"` 2. Create an `nvidia-device-plugin.yaml` manifest like the one below, and a deploy it with `uds zarf tools kubectl apply -f nvidia-device-plugin.yaml` From b1174255dc0572046c33dc5e8ad57212ddbe8e34 Mon Sep 17 00:00:00 2001 From: Justin Law Date: Fri, 1 Nov 2024 14:54:29 -0400 Subject: [PATCH 8/9] add better docs for core and core-slim-dev --- docs/GPU.md | 138 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 96 insertions(+), 42 deletions(-) diff --git a/docs/GPU.md b/docs/GPU.md index 4b8b203..8caf257 100644 --- a/docs/GPU.md +++ b/docs/GPU.md @@ -2,48 +2,15 @@ UDS K3d comes with optional base images that provide GPU scheduling in the cluster to allow for GPU-accelerated workloads (e.g., LLMs). Currently, UDS K3d only supports NVIDIA CUDA-capable GPUs, with considerations for supporting AMD and other workloads in the future. -## NVIDIA +## Usage -### Pre-Requisites +The following usage steps use the `cuda` flavor of the UDS K3d package as an example of how to enable GPU access in a K3d cluster. 
-### NVIDIA Drivers +For troubleshooting during UDS K3d deployments, please see each flavor's specific instructions: -- Ensure that the proper [NVIDIA drivers](https://www.nvidia.com/download/index.aspx) are installed (>=525.60). -- Follow the [driver download](https://www.nvidia.com/download/index.aspx) by identifying your hardware from the provided list. - -### NVIDIA Container Toolkit - -- [Read the pre-requisites for installation and follow the instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-apt) to download and install the NVIDIA container toolkit (>=1.14). -- After the successful installation off the toolkit, follow the [toolkit instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#configuring-docker) to verify that your default Docker runtime is configured for NVIDIA: - - ```bash - nvidia-ctk runtime configure --runtime=docker --config=$HOME/.config/docker/daemon.json - ``` - -- Verify that `nvidia` is now a runtime available to the Docker daemon to use: - - ```bash - # the expected output should be similar to: `Runtimes: io.containerd.runc.v2 nvidia runc` - docker info | grep -i nvidia - ``` - -- [Try out a sample CUDA workload](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/sample-workload.html) to ensure your Docker containers have access to the GPUs after configuration. -- (OPTIONAL) You can configure Docker to use the `nvidia` runtime by default by adding the `--set-as-default` flag during the container toolkit post-installation configuration step by running the following command: - - ```bash - nvidia-ctk runtime configure --runtime=docker --config=$HOME/.config/docker/daemon.json --set-as-default - ``` +1. [`cuda` flavor troubleshooting](#nvidia) -- (OPTIONAL) Verify that the default runtime is changed by running the following command: - - ```bash - # the expected output should be similar to: `Default Runtime: nvidia` - docker info | grep "Default Runtime" - ``` - -### Usage - -#### Local Build and Deployment +### Local Build and Deployment To use the NVIDIA CUDA K3s image when bootstrapping a UDS K3d cluster, execute the following: @@ -51,7 +18,7 @@ To use the NVIDIA CUDA K3s image when bootstrapping a UDS K3d cluster, execute t uds run default-cuda ``` -#### Remote Package Deployment +### Remote Package Deployment To use the NVIDIA CUDA K3s image when bootstrapping a UDS K3d cluster, execute the following: @@ -64,7 +31,7 @@ uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:${PACKAGE -##### Additional Base Images +#### Additional Base Images This repository publishes several variations of the underlying K3d image and CUDA image so that it covers more compatibility cases (e.g., GPU driver versions, K3d versions, etc.). Please see the [published images](https://github.com/defenseunicorns/uds-k3d/pkgs/container/uds-k3d%2Fcuda-k3s) for all possible variations. 
@@ -78,7 +45,7 @@ uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:${PACKAGE uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:${PACKAGE_VERSION}-cuda --confirm --set K3S_IMAGE_VERSION="v1.29.8-k3s1" --set CUDA_IMAGE_VERSION="11.8.0-base-ubuntu22.04" ``` -#### Tests +### Tests This repository includes two CUDA workload tests that can be executed: @@ -87,7 +54,94 @@ uds run validate-cuda # device info query uds run validate-cuda --set CUDA_TEST="cuda-vector-add" # vector addition ``` -### Troubleshooting +### UDS Core + +Deploying UDS Core with a UDS K3d cluster capable of GPU support without building your own UDS bundle, that includes UDS K3d's GPU flavors and UDS Core, requires some extra argument in the UDS CLI. Below are examples of deploying the [full UDS Core](#core) and the [developer version of UDS Core](#core-slim-dev). + +### Core + +To deploy the full set of UDS Core services on top of UDS K3d `cuda`, you can run the following commands: + + + +```bash +export PACKAGE_VERSION=0.9.0 +uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:${PACKAGE_VERSION}-cuda --confirm +# fill-in with your desired UDS Core version and flavor +uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds/core:${UDS_CORE_VERSION}-${UDS_CORE_FLAVOR} --confirm +``` + + + +### Core Slim Dev + +Since the slim development version of UDS Core is only published as a bundle, the `cuda` version of the UDS K3d Zarf package cannot be used directly; therefore, the K3d arguments and NVIDIA GPU operator deployment that are normally handled automatically within this [Zarf package](../zarf.yaml) must be done manually. + +To allow GPU access in a UDS Core slim development cluster, the base k3s-cuda image published by this repository must be passed into the bundle deployment command and a separate deployment of one of the following options: + +1. [NVIDIA Device Plugin](https://github.com/NVIDIA/k8s-device-plugin) +2. [NVIDIA GPU Operator](https://github.com/NVIDIA/gpu-operator) + +To deploy the slim development set of UDS Core services on top of UDS K3d `cuda`, you can run the following commands: + +```bash +# fill-in with your desired UDS Core version +# fill-in with your desired k3s-CUDA image published by this repository +uds deploy k3d-core-slim-dev:${UDS_CORE_SLIM_DEV_VERSION} --set K3D_EXTRA_ARGS="--gpus=all --image=${K3S_CUDA_IMAGE}" --confirm + +# OPTION #1: use the NVIDIA Device Plugin from upstream - fill-in the desired version +uds zarf tools kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/${NVIDIA_DEVICE_PLUGIN_VERSION}/deployments/static/nvidia-device-plugin.yml + +# OPTION #2: use the NVIDIA GPU Operator's helm repository with this UDS K3d's NVIDIA GPU Operator values file +# this options requires helm to be locally installed and for the aforementioned values file to be available +helm repo add nvidia https://helm.ngc.nvidia.com/nvidia +helm repo update +helm install --wait --generate-name \ + -n kube-system \ + --values values/nvidia-gpu-operator-values.yaml \ + nvidia/gpu-operator +``` + +## Troubleshooting + +### NVIDIA + +#### NVIDIA Pre-Requisites + +##### NVIDIA Drivers + +- Ensure that the proper [NVIDIA drivers](https://www.nvidia.com/download/index.aspx) are installed (>=525.60). +- Follow the [driver download](https://www.nvidia.com/download/index.aspx) by identifying your hardware from the provided list. 
+ +##### NVIDIA Container Toolkit + +- [Read the pre-requisites for installation and follow the instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-apt) to download and install the NVIDIA container toolkit (>=1.14). +- After the successful installation off the toolkit, follow the [toolkit instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#configuring-docker) to verify that your default Docker runtime is configured for NVIDIA: + + ```bash + nvidia-ctk runtime configure --runtime=docker --config=$HOME/.config/docker/daemon.json + ``` + +- Verify that `nvidia` is now a runtime available to the Docker daemon to use: + + ```bash + # the expected output should be similar to: `Runtimes: io.containerd.runc.v2 nvidia runc` + docker info | grep -i nvidia + ``` + +- [Try out a sample CUDA workload](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/sample-workload.html) to ensure your Docker containers have access to the GPUs after configuration. +- (OPTIONAL) You can configure Docker to use the `nvidia` runtime by default by adding the `--set-as-default` flag during the container toolkit post-installation configuration step by running the following command: + + ```bash + nvidia-ctk runtime configure --runtime=docker --config=$HOME/.config/docker/daemon.json --set-as-default + ``` + +- (OPTIONAL) Verify that the default runtime is changed by running the following command: + + ```bash + # the expected output should be similar to: `Default Runtime: nvidia` + docker info | grep "Default Runtime" + ``` #### NVML Errors or Missing CUDA Dependencies From 25eb6c6ae76be1bde7959f1f18cf5ef12f74e139 Mon Sep 17 00:00:00 2001 From: Justin Law <81255462+justinthelaw@users.noreply.github.com> Date: Fri, 1 Nov 2024 16:21:58 -0400 Subject: [PATCH 9/9] use lfai nvidia device plugin values --- docs/GPU.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/GPU.md b/docs/GPU.md index 8caf257..90252f3 100644 --- a/docs/GPU.md +++ b/docs/GPU.md @@ -90,7 +90,7 @@ To deploy the slim development set of UDS Core services on top of UDS K3d `cuda` uds deploy k3d-core-slim-dev:${UDS_CORE_SLIM_DEV_VERSION} --set K3D_EXTRA_ARGS="--gpus=all --image=${K3S_CUDA_IMAGE}" --confirm # OPTION #1: use the NVIDIA Device Plugin from upstream - fill-in the desired version -uds zarf tools kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/${NVIDIA_DEVICE_PLUGIN_VERSION}/deployments/static/nvidia-device-plugin.yml +uds zarf tools kubectl create -f https://raw.githubusercontent.com/defenseunicorns/leapfrogai/refs/heads/main/packages/k3d-gpu/plugin/device-plugin-daemonset.yaml # OPTION #2: use the NVIDIA GPU Operator's helm repository with this UDS K3d's NVIDIA GPU Operator values file # this options requires helm to be locally installed and for the aforementioned values file to be available