diff --git a/.github/.gitkeep b/.github/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/.github/workflows/build-test.yaml b/.github/workflows/build-test.yaml index 96ce302..b3e29a1 100644 --- a/.github/workflows/build-test.yaml +++ b/.github/workflows/build-test.yaml @@ -7,35 +7,32 @@ on: - "docs/**" - "CODEOWNERS" -permissions: - id-token: write - contents: read - jobs: test-clean-install: runs-on: ubuntu-latest + + permissions: + id-token: write + contents: read + strategy: matrix: - image: ["rancher/k3s"] - version: ["v1.29.8-k3s1", "v1.30.4-k3s1", "v1.31.0-k3s1"] + k3s_image_repository: ["rancher/k3s"] + k3s_image_version: + ["v1.28.8-k3s1", "v1.29.8-k3s1", "v1.30.4-k3s1", "v1.31.0-k3s1"] steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4 - name: Setup UDS - if: always() uses: defenseunicorns/uds-common/.github/actions/setup@e3008473beab00b12a94f9fcc7340124338d5c08 # v0.13.1 with: - username: ${{secrets.IRON_BANK_ROBOT_USERNAME}} - password: ${{secrets.IRON_BANK_ROBOT_PASSWORD}} - - # Step is not currently being used, could be uncommented if custom image support is needed in the future - # - name: Build the custom k3s image - # if: ${{matrix.image}} != "rancher/k3s" - # run: uds run build-image --set VERSION=${{matrix.version}} --no-progress + registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} + registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} + ghToken: ${{ secrets.GITHUB_TOKEN }} - name: Create and deploy the uds-k3d package - run: uds run --set IMAGE_NAME=${{matrix.image}} --set VERSION=${{matrix.version}} --no-progress + run: uds run --set K3S_IMAGE_VERSION=${{matrix.k3s_image_version}} --set K3S_IMAGE_REPOSITORY=${{matrix.k3s_image_repository}} --no-progress - name: Validate uds-k3d package run: uds run validate --no-progress diff --git a/.github/workflows/publish-image.yaml b/.github/workflows/publish-image.yaml deleted file mode 100644 index 423a4f6..0000000 --- a/.github/workflows/publish-image.yaml +++ /dev/null @@ -1,43 +0,0 @@ -name: Publish k3s image - -# Workflow is not currently being used, switched to a manual trigger only -on: workflow_dispatch - # push: - # branches: - # - main - # paths: - # - docker/** - # - .github/workflows/publish-image.yaml - -jobs: - publish-k3s-image: - runs-on: ubuntu-latest - strategy: - matrix: - version: ["v1.2.3-k3s1"] # Placeholder - - permissions: - contents: read - packages: write - - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 - - - uses: docker/setup-buildx-action@c47758b77c9736f4b2ef4073d4d51994fabfe349 # v3.7.1 - - - name: Setup UDS - if: always() - uses: defenseunicorns/uds-common/.github/actions/setup@e3008473beab00b12a94f9fcc7340124338d5c08 # v0.13.1 - with: - username: ${{secrets.IRON_BANK_ROBOT_USERNAME}} - password: ${{secrets.IRON_BANK_ROBOT_PASSWORD}} - - - name: Login to GHCR - uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3 - with: - registry: ghcr.io - username: dummy - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Publish the custom k3s image - run: uds run publish-image --set VERSION=${{matrix.version}} --no-progress diff --git a/.github/workflows/tag-and-release.yaml b/.github/workflows/tag-and-release.yaml index 15a0026..ae0833c 100644 --- a/.github/workflows/tag-and-release.yaml +++ b/.github/workflows/tag-and-release.yaml @@ -7,8 +7,10 @@ on: jobs: tag-new-version: - permissions: write-all runs-on: ubuntu-latest + 
+ permissions: write-all + outputs: release_created: ${{ steps.release-flag.outputs.release_created }} steps: @@ -25,28 +27,73 @@ jobs: if: ${{ needs.tag-new-version.outputs.release_created == 'true'}} runs-on: ubuntu-latest + strategy: + matrix: + k3s_image_repository: ["rancher/k3s"] + k3s_image_version: + ["v1.28.8-k3s1", "v1.29.8-k3s1", "v1.30.4-k3s1", "v1.31.0-k3s1"] + permissions: contents: read packages: write steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4 - name: Setup UDS - if: always() uses: defenseunicorns/uds-common/.github/actions/setup@e3008473beab00b12a94f9fcc7340124338d5c08 # v0.13.1 with: - username: ${{secrets.IRON_BANK_ROBOT_USERNAME}} - password: ${{secrets.IRON_BANK_ROBOT_PASSWORD}} + registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} + registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} + ghToken: ${{ secrets.GITHUB_TOKEN }} + + - name: Publish the base capability + run: | + uds zarf package create --confirm -a arm64 -o oci://ghcr.io/defenseunicorns/packages \ + --set K3S_IMAGE_REPOSITORY=${{ matrix.k3s_image_repository }} \ + --set K3S_IMAGE_VERSION=${{ matrix.k3s_image_version }} + + uds zarf package create --confirm -a amd64 -o oci://ghcr.io/defenseunicorns/packages \ + --set K3S_IMAGE_REPOSITORY=${{ matrix.k3s_image_repository }} \ + --set K3S_IMAGE_VERSION=${{ matrix.k3s_image_version }} + + publish-uds-cuda-package: + needs: tag-new-version + if: ${{ needs.tag-new-version.outputs.release_created == 'true'}} + runs-on: ubuntu-latest + + strategy: + matrix: + k3s_image_repository: ["rancher/k3s"] + k3s_image_version: + ["v1.28.8-k3s1", "v1.29.8-k3s1", "v1.30.4-k3s1", "v1.31.0-k3s1"] + cuda_image_version: + [ + 11.8.0-base-ubuntu22.04, + 12.1.0-base-ubuntu22.04, + 12.5.0-base-ubuntu22.04, + ] + + steps: + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4 - - name: Login to GHCR - uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3 + - uses: docker/setup-buildx-action@8026d2bc3645ea78b0d2544766a1225eb5691f89 # v3.7.0 + + - name: Setup UDS + uses: defenseunicorns/uds-common/.github/actions/setup@e3008473beab00b12a94f9fcc7340124338d5c08 # v0.13.1 with: - registry: ghcr.io - username: dummy - password: ${{ secrets.GITHUB_TOKEN }} + registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} + registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} + ghToken: ${{ secrets.GITHUB_TOKEN }} + + - name: Publish the CUDA K3s image + run: | + uds run publish-cuda-image \ + --set K3S_IMAGE_REPOSITORY=${{ matrix.k3s_image_repository }} \ + --set K3S_IMAGE_VERSION="${{ matrix.k3s_image_version }}" \ + --set CUDA_IMAGE_VERSION="${{ matrix.cuda_image_version }}" \ + --no-progress - - name: Publish the capability + - name: Publish the CUDA capability run: | - uds zarf package create --confirm -a arm64 -o oci://ghcr.io/defenseunicorns/packages - uds zarf package create --confirm -a amd64 -o oci://ghcr.io/defenseunicorns/packages + uds zarf package create --confirm -a amd64 -o oci://ghcr.io/defenseunicorns/packages --flavor cuda diff --git a/README.md b/README.md index ab4b3d7..ec7f236 100644 --- a/README.md +++ b/README.md @@ -16,16 +16,17 @@ sudo ssh -N -L 80:localhost:80 -L 443:localhost:443 -L 6550:localhost:6550 [!NOTE] > UDS K3d intentionally does not address airgap concerns for K3d or the load balancer logic deployed in this package. 
This allows running `zarf init` or deploying a Zarf Init Package via a UDS Bundle after the UDS K3d environment is deployed.

-## Prerequisites
+## Pre-Requisites

 - [UDS CLI](https://github.com/defenseunicorns/uds-cli/blob/main/README.md#install) & [K3d](https://k3d.io/#installation) using the versions specified in the [uds-common repo](https://github.com/defenseunicorns/uds-common/blob/main/README.md#supported-tool-versions)
 - [Docker](https://docs.docker.com/get-docker/) or [Podman](https://podman.io/getting-started/installation) for running K3d
+- See the [GPU Configuration](./docs/GPU.md) documentation for details on enabling NVIDIA GPU support within the cluster

 ## Deploy

-`uds zarf package deploy oci://defenseunicorns/uds-k3d:0.9.0`
+`uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:0.9.0`

@@ -53,13 +54,13 @@ k3d cluster start uds

 ## Additional Info

-You can set extra k3d args by setting the deploy-time ZARF_VAR_K3D_EXTRA_ARGS. See below `zarf-config.yaml` example k3d args:
+You can set extra K3d arguments by setting the deploy-time `ZARF_VAR_K3D_EXTRA_ARGS` variable. See the `zarf-config.yaml` example below for K3d args:

 ```yaml
 package:
   deploy:
     set:
-      k3d_extra_args: "--k3s-arg --gpus=1 --k3s-arg --="
+      k3d_extra_args: --k3s-arg "--=@server:*" --gpus=all
 ```

 ### Configure MinIO

@@ -69,3 +70,7 @@ package:

 ### DNS Assumptions

 - [DNS Assumptions](docs/DNS.md)
+
+### Enabling GPU Support
+
+- [GPU Workload Configuration](docs/GPU.md)
diff --git a/docker/Dockerfile b/docker/Dockerfile
deleted file mode 100644
index 8c69e70..0000000
--- a/docker/Dockerfile
+++ /dev/null
@@ -1,5 +0,0 @@
-ARG K3S_TAG="v1.31.2-k3s1" # Placeholder
-
-FROM rancher/k3s:$K3S_TAG as k3s
-
-# Do custom image things
diff --git a/docker/Dockerfile.gpu b/docker/Dockerfile.gpu
new file mode 100644
index 0000000..1e7339b
--- /dev/null
+++ b/docker/Dockerfile.gpu
@@ -0,0 +1,35 @@
+ARG K3S_REPOSITORY="rancher/k3s"
+ARG K3S_TAG="v1.30.4-k3s1"
+ARG CUDA_TAG="12.1.0-base-ubuntu22.04"
+
+FROM $K3S_REPOSITORY:$K3S_TAG AS k3s
+
+FROM nvidia/cuda:$CUDA_TAG
+
+# Install the NVIDIA container toolkit
+RUN apt-get update && \
+    apt-get install -y curl && \
+    curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg && \
+    curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
+    sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
+    tee /etc/apt/sources.list.d/nvidia-container-toolkit.list && \
+    apt-get update && \
+    apt-get install -y nvidia-container-toolkit-base nvidia-container-toolkit nvidia-container-runtime util-linux && \
+    nvidia-ctk runtime configure --runtime=containerd
+
+COPY --from=k3s / / --exclude=/bin/
+COPY --from=k3s /bin /bin
+
+VOLUME /var/lib/kubelet
+VOLUME /var/lib/rancher/k3s
+VOLUME /var/lib/cni
+VOLUME /var/log
+
+# Resolve fsnotify issues
+RUN sysctl -w fs.inotify.max_user_watches=100000 && \
+    sysctl -w fs.inotify.max_user_instances=100000
+
+ENV PATH="$PATH:/bin/aux"
+
+ENTRYPOINT ["/bin/k3s"]
+CMD ["agent"]
diff --git a/docs/GPU.md b/docs/GPU.md
new file mode 100644
index 0000000..90252f3
--- /dev/null
+++ b/docs/GPU.md
@@ -0,0 +1,284 @@
+# GPU
+
+UDS K3d comes with optional base images that provide GPU scheduling in the cluster to allow for GPU-accelerated workloads (e.g., LLMs). Currently, UDS K3d only supports NVIDIA CUDA-capable GPUs, with considerations for supporting AMD and other workloads in the future.
+
+## Usage
+
+The following usage steps use the `cuda` flavor of the UDS K3d package as an example of how to enable GPU access in a K3d cluster.
+
+For troubleshooting during UDS K3d deployments, please see each flavor's specific instructions:
+
+1. [`cuda` flavor troubleshooting](#nvidia)
+
+### Local Build and Deployment
+
+To build the NVIDIA CUDA K3s image locally and use it when bootstrapping a UDS K3d cluster, execute the following:
+
+```bash
+uds run default-cuda
+```
+
+### Remote Package Deployment
+
+To use the prebuilt NVIDIA CUDA K3s image when bootstrapping a UDS K3d cluster, execute the following:
+
+
+```bash
+export PACKAGE_VERSION=0.9.0
+uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:${PACKAGE_VERSION}-cuda --confirm
+```
+
+
+
+#### Additional Base Images
+
+This repository publishes several variations of the underlying K3s and CUDA images to cover more compatibility cases (e.g., GPU driver versions, K3s versions, etc.). Please see the [published images](https://github.com/defenseunicorns/uds-k3d/pkgs/container/uds-k3d%2Fcuda-k3s) for all possible variations.
+
+Below are some examples of setting these variables to choose a different variation at deploy-time:
+
+```bash
+uds run default-cuda --set K3S_IMAGE_VERSION="v1.29.8-k3s1" --set CUDA_IMAGE_VERSION="12.1.0-base-ubuntu22.04"
+# OR
+uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:${PACKAGE_VERSION}-cuda --confirm --set K3S_IMAGE_VERSION="v1.31.0-k3s1" --set CUDA_IMAGE_VERSION="12.5.0-base-ubuntu22.04"
+# OR
+uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:${PACKAGE_VERSION}-cuda --confirm --set K3S_IMAGE_VERSION="v1.29.8-k3s1" --set CUDA_IMAGE_VERSION="11.8.0-base-ubuntu22.04"
+```
+
+### Tests
+
+This repository includes two CUDA workload tests that can be executed:
+
+```bash
+uds run validate-cuda # device info query
+uds run validate-cuda --set CUDA_TEST="cuda-vector-add" # vector addition
+```
+
+### UDS Core
+
+Deploying UDS Core on a GPU-capable UDS K3d cluster, without building your own UDS bundle that includes both UDS K3d's GPU flavor and UDS Core, requires some extra arguments in the UDS CLI. Below are examples of deploying the [full UDS Core](#core) and the [developer version of UDS Core](#core-slim-dev).
+
+### Core
+
+To deploy the full set of UDS Core services on top of UDS K3d `cuda`, you can run the following commands:
+
+
+```bash
+export PACKAGE_VERSION=0.9.0
+uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:${PACKAGE_VERSION}-cuda --confirm
+# fill-in with your desired UDS Core version and flavor
+uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds/core:${UDS_CORE_VERSION}-${UDS_CORE_FLAVOR} --confirm
+```
+
+
+
+### Core Slim Dev
+
+Since the slim development version of UDS Core is only published as a bundle, the `cuda` version of the UDS K3d Zarf package cannot be used directly; therefore, the K3d arguments and NVIDIA GPU Operator deployment that are normally handled automatically within this [Zarf package](../zarf.yaml) must be done manually.
+
+To allow GPU access in a UDS Core slim development cluster, the base k3s-cuda image published by this repository must be passed into the bundle deployment command, along with a separate deployment of one of the following options:
+
+1. [NVIDIA Device Plugin](https://github.com/NVIDIA/k8s-device-plugin)
+2. [NVIDIA GPU Operator](https://github.com/NVIDIA/gpu-operator)
+
+To deploy the slim development set of UDS Core services on top of UDS K3d `cuda`, you can run the following commands:
+
+```bash
+# fill-in with your desired UDS Core version
+# fill-in with your desired k3s-cuda image published by this repository
+uds deploy k3d-core-slim-dev:${UDS_CORE_SLIM_DEV_VERSION} --set K3D_EXTRA_ARGS="--gpus=all --image=${K3S_CUDA_IMAGE}" --confirm
+
+# OPTION #1: use the NVIDIA Device Plugin from upstream - fill-in the desired version
+uds zarf tools kubectl create -f https://raw.githubusercontent.com/defenseunicorns/leapfrogai/refs/heads/main/packages/k3d-gpu/plugin/device-plugin-daemonset.yaml
+
+# OPTION #2: use the NVIDIA GPU Operator's helm repository with this UDS K3d's NVIDIA GPU Operator values file
+# this option requires helm to be locally installed and the aforementioned values file to be available
+helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
+helm repo update
+helm install --wait --generate-name \
+  -n kube-system \
+  --values values/nvidia-gpu-operator-values.yaml \
+  nvidia/gpu-operator
+```
+
+## Troubleshooting
+
+### NVIDIA
+
+#### NVIDIA Pre-Requisites
+
+##### NVIDIA Drivers
+
+- Ensure that the proper [NVIDIA drivers](https://www.nvidia.com/download/index.aspx) are installed (>=525.60).
+- Follow the [driver download](https://www.nvidia.com/download/index.aspx) by identifying your hardware from the provided list.
+
+##### NVIDIA Container Toolkit
+
+- [Read the pre-requisites for installation and follow the instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-apt) to download and install the NVIDIA container toolkit (>=1.14).
+- After the successful installation of the toolkit, follow the [toolkit instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#configuring-docker) to verify that your default Docker runtime is configured for NVIDIA:
+
+  ```bash
+  nvidia-ctk runtime configure --runtime=docker --config=$HOME/.config/docker/daemon.json
+  ```
+
+- Verify that `nvidia` is now a runtime available to the Docker daemon to use:
+
+  ```bash
+  # the expected output should be similar to: `Runtimes: io.containerd.runc.v2 nvidia runc`
+  docker info | grep -i nvidia
+  ```
+
+- [Try out a sample CUDA workload](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/sample-workload.html) to ensure your Docker containers have access to the GPUs after configuration.
+- (OPTIONAL) You can configure Docker to use the `nvidia` runtime by default by adding the `--set-as-default` flag to the container toolkit post-installation configuration step:
+
+  ```bash
+  nvidia-ctk runtime configure --runtime=docker --config=$HOME/.config/docker/daemon.json --set-as-default
+  ```
+
+- (OPTIONAL) Verify that the default runtime has been changed by running the following command:
+
+  ```bash
+  # the expected output should be similar to: `Default Runtime: nvidia`
+  docker info | grep "Default Runtime"
+  ```
+
+#### NVML Errors or Missing CUDA Dependencies
+
+None of the following should ever error or return `unknown version`:
+
+1. Check if your NVIDIA GPU drivers are installed:
+
+   ```bash
+   nvidia-smi
+   ```
+
+2. Check the version of your NVIDIA Container Toolkit:
+
+   ```bash
+   nvidia-ctk --version
+   ```
+
+3. Try looking at your Docker runtime information and make sure the following returns several lines of information:
+
+   ```bash
+   docker info | grep "nvidia"
+   ```
+
+4. Try running a CUDA sample test in the cluster: [CUDA Vector Add](https://github.com/NVIDIA/k8s-device-plugin/blob/a6a7ce12d28618d343c251ca0941222d7b8a46d3/README.md?plain=1#L145).
+
+#### Memory Errors or Process Locks
+
+If you are not deploying a fresh cluster or fresh packages (e.g., a GPU workload is already deployed), or you have a GPU that is running other workloads (e.g., display), then there may not be enough resources to offload the workloads to the NVIDIA GPU.
+
+1. To see what host-level processes are on your NVIDIA GPU(s), run the following:
+
+   ```bash
+   nvidia-smi
+   ```
+
+2. To check which pods are scheduled with GPU resources in particular, you can run the following `uds zarf tools yq` command:
+
+   ```bash
+   uds zarf tools kubectl get pods \
+     --all-namespaces \
+     --output=yaml \
+     | uds zarf tools yq eval -o=json '
+       ["Pod", "Namespace", "Container", "GPU"] as $header |
+       [$header] + [
+         .items[] |
+         .metadata as $metadata |
+         .spec.containers[] |
+         select(.resources.requests["nvidia.com/gpu"]) |
+         [
+           $metadata.name,
+           $metadata.namespace,
+           .name,
+           .resources.requests["nvidia.com/gpu"]
+         ]
+       ]' - \
+     | uds zarf tools yq -r '(.[0] | @tsv), (.[1:][] | @tsv)' \
+     | column -t -s $'\t'
+   ```
+
+When you reinstall or start a new GPU-dependent pod, the previous PID (process) on the GPU may not have been flushed yet.
+
+1. Scale the previous GPU-dependent pod deployment down to 0, as the current `RollingUpdate` strategy for vLLM relies on back-up/secondary GPUs to be available for a graceful turnover.
+2. Use `nvidia-smi` to check if the process has been flushed upon Pod termination BEFORE you deploy a new GPU-dependent pod, and if not, use `kill -9 <PID>` to manually flush the process.
+
+#### macOS
+
+UDS K3d's NVIDIA GPU support does not work on macOS.
+
+#### Windows (WSL2)
+
+The NVIDIA GPU Operator does not work on WSL2 as of version v24.3.0 (see [issue](https://github.com/NVIDIA/gpu-operator/issues/318)); however, the NVIDIA Device Plugin, by itself, does work as of version 0.15.0-rc1 (see [comment](https://github.com/NVIDIA/k8s-device-plugin/issues/332#issuecomment-1927997436)).
+
+To get around this issue, the recommended course of action is to install UDS K3d with the `cuda` flavor, delete the NVIDIA GPU Operator deployment, and then deploy the NVIDIA Device Plugin separately. Below are the steps for doing so:
+
+1. Run `uds run default --set K3D_EXTRA_ARGS="--gpus=all"` or `uds zarf package deploy oci://ghcr.io/defenseunicorns/packages/uds-k3d:${PACKAGE_VERSION} --confirm --set K3D_EXTRA_ARGS="--gpus=all"`
+2. Create an `nvidia-device-plugin.yaml` manifest like the one below, and deploy it with `uds zarf tools kubectl apply -f nvidia-device-plugin.yaml`
+
+   ```yaml
+   apiVersion: node.k8s.io/v1
+   kind: RuntimeClass
+   metadata:
+     name: nvidia
+   handler: nvidia
+   ---
+   apiVersion: apps/v1
+   kind: DaemonSet
+   metadata:
+     name: nvidia-device-plugin-daemonset
+     namespace: kube-system
+   spec:
+     selector:
+       matchLabels:
+         name: nvidia-device-plugin-daemonset
+     updateStrategy:
+       type: RollingUpdate
+     template:
+       metadata:
+         labels:
+           name: nvidia-device-plugin-daemonset
+       spec:
+         runtimeClassName: nvidia # Explicitly request the runtime
+         tolerations:
+           - key: nvidia.com/gpu
+             operator: Exists
+             effect: NoSchedule
+         # Mark this pod as a critical add-on; when enabled, the critical add-on
+         # scheduler reserves resources for critical add-on pods so that they can
+         # be rescheduled after a failure.
+         # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
+         priorityClassName: "system-node-critical"
+         containers:
+           - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2
+             name: nvidia-device-plugin-ctr
+             env:
+               - name: PASS_DEVICE_SPECS
+                 value: "true"
+               - name: FAIL_ON_INIT_ERROR
+                 value: "true"
+               - name: DEVICE_LIST_STRATEGY
+                 value: envvar
+               - name: DEVICE_ID_STRATEGY
+                 value: uuid
+               - name: NVIDIA_VISIBLE_DEVICES
+                 value: all
+               - name: NVIDIA_DRIVER_CAPABILITIES
+                 value: compute,utility
+               - name: MPS_ROOT
+                 value: /run/nvidia/mps
+             securityContext:
+               allowPrivilegeEscalation: false
+               capabilities:
+                 drop: ["ALL"]
+             volumeMounts:
+               - name: device-plugin
+                 mountPath: /var/lib/kubelet/device-plugins
+         volumes:
+           - name: device-plugin
+             hostPath:
+               path: /var/lib/kubelet/device-plugins
+   ```
diff --git a/release-please-config.json b/release-please-config.json
index 85c883d..97e8e28 100644
--- a/release-please-config.json
+++ b/release-please-config.json
@@ -10,7 +10,12 @@
         { "type": "chore", "section": "Miscellaneous", "hidden": false }
       ],
       "versioning": "default",
-      "extra-files": ["README.md", "zarf.yaml", "chart/Chart.yaml"]
+      "extra-files": [
+        "README.md",
+        "zarf.yaml",
+        "chart/Chart.yaml",
+        "docs/GPU.md"
+      ]
     }
   }
 }
diff --git a/tasks.yaml b/tasks.yaml
index 00fb80c..732c10f 100644
--- a/tasks.yaml
+++ b/tasks.yaml
@@ -1,38 +1,63 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/defenseunicorns/uds-cli/v0.16.0/tasks.schema.json
+
 variables:
-  - name: VERSION
-    default: "v1.30.4-k3s1"
-  - name: IMAGE_NAME
+  - name: K3S_IMAGE_REPOSITORY
     default: "rancher/k3s"
+  - name: K3S_IMAGE_VERSION
+    default: "v1.30.4-k3s1"
+  - name: CUDA_IMAGE_VERSION
+    default: "12.1.0-base-ubuntu22.04"
   - name: K3D_EXTRA_ARGS
     default: ""
   - name: NGINX_EXTRA_PORTS
     default: "[]"
+  - name: CUDA_TEST
+    default: "cuda-device-query"

 tasks:
   - name: default
     description: "Build and deploy uds-k3d"
     actions:
-      - description: "Build UDS K3d package"
-        cmd: "uds zarf package create --confirm --no-progress"
+      - description: "Build the uds-k3d package"
+        cmd: "rm -rf zarf-package-uds-k3d-*.tar.zst && uds zarf package create --confirm --no-progress"

-      - description: "Deploy UDS K3d package"
+      - description: "Deploy the uds-k3d package"
         cmd: |
           uds zarf package deploy zarf-package-uds-k3d-*.tar.zst \
-            --set K3D_IMAGE=${IMAGE_NAME}:${VERSION} \
+            --set K3S_IMAGE_REPOSITORY=${K3S_IMAGE_REPOSITORY} \
+            --set K3S_IMAGE_VERSION=${K3S_IMAGE_VERSION} \
+            --set K3D_EXTRA_ARGS="${K3D_EXTRA_ARGS}" \
+            --set NGINX_EXTRA_PORTS="${NGINX_EXTRA_PORTS}" \
+            --no-progress --confirm
+
+  - name: 
default-cuda + description: "Build and deploy uds-k3d with CUDA support" + actions: + - description: "Build the uds-k3d CUDA package" + cmd: "rm -rf zarf-package-uds-k3d-*.tar.zst && uds zarf package create --flavor cuda --confirm --no-progress" + + - description: "Build the k3s-cuda image locally" + task: build-cuda-image + + - description: "Deploy the uds-k3d CUDA package" + cmd: | + uds zarf package deploy zarf-package-uds-k3d-*.tar.zst \ + --set K3S_IMAGE_VERSION=${K3S_IMAGE_VERSION} \ + --set CUDA_IMAGE_VERSION=${CUDA_IMAGE_VERSION} \ --set K3D_EXTRA_ARGS="${K3D_EXTRA_ARGS}" \ --set NGINX_EXTRA_PORTS="${NGINX_EXTRA_PORTS}" \ --no-progress --confirm - name: validate actions: - - description: Validate coredns is up + - description: "Validate CoreDNS is up" wait: cluster: kind: Pod name: "k8s-app=kube-dns" namespace: kube-system condition: Ready - - description: Validate coredns is resolving *.uds.dev internally + - description: "Validate CoreDNS is resolving *.uds.dev internally" cmd: | set -e FOO_IP=$(uds zarf tools kubectl run dig-test --image=arunvelsriram/utils -q --restart=Never --rm -i -- dig +short foo.uds.dev) @@ -42,19 +67,45 @@ tasks: echo "CoreDNS patch failed, foo.uds.dev is resolving to 127.0.0.1" exit 1 fi - - description: Validate zarf init + - description: "Validate zarf init" cmd: | set -e uds zarf tools download-init --no-progress # Test zarf init due to containerd issue - https://github.com/defenseunicorns/zarf/issues/592 uds zarf init --confirm --no-progress - # - name: build-image - # actions: - # - description: Build the custom k3s image - # cmd: docker build -t ${IMAGE_NAME}:${VERSION} --build-arg K3S_TAG=${VERSION} docker/ + - name: validate-cuda + description: "Run a CUDA test workload in-cluster" + actions: + - description: "Deploy the test pod to the cluster" + cmd: | + uds zarf tools kubectl apply -f tests/${CUDA_TEST}.yaml + - description: "Await test completion and then display the test results" + cmd: | + uds zarf tools wait-for Pod cuda-test-pod '{.status.phase}'=Succeeded -n default --no-progress --timeout 120s + uds zarf tools kubectl logs -l app=cuda-test-pod -n default + - description: "Remove the completed test pod" + cmd: | + uds zarf tools kubectl delete Pod cuda-test-pod + + - name: build-cuda-image + actions: + - description: "Build the k3s-cuda image" + cmd: | + docker build -t ghcr.io/defenseunicorns/uds-k3d/cuda-k3s:${K3S_IMAGE_VERSION}-cuda-${CUDA_IMAGE_VERSION} \ + --build-arg K3S_REPOSITORY=${K3S_IMAGE_REPOSITORY} \ + --build-arg K3S_TAG=${K3S_IMAGE_VERSION} \ + --build-arg CUDA_TAG=${CUDA_IMAGE_VERSION} \ + -f docker/Dockerfile.gpu . -q - # - name: publish-image - # actions: - # - description: Publish the custom k3s image - # cmd: docker buildx build --push --platform linux/arm64/v8,linux/amd64 --tag ${IMAGE_NAME}:${VERSION} docker + - name: publish-cuda-image + actions: + - description: "Publish the k3s-cuda image" + cmd: | + docker buildx build --push \ + --platform linux/amd64 \ + --build-arg K3S_REPOSITORY=${K3S_IMAGE_REPOSITORY} \ + --build-arg K3S_TAG=${K3S_IMAGE_VERSION} \ + --build-arg CUDA_TAG=${CUDA_IMAGE_VERSION} \ + -t ghcr.io/defenseunicorns/uds-k3d/cuda-k3s:${K3S_IMAGE_VERSION}-cuda-${CUDA_IMAGE_VERSION} \ + -f docker/Dockerfile.gpu . 
-q diff --git a/tests/cuda-device-query.yaml b/tests/cuda-device-query.yaml new file mode 100644 index 0000000..b1b95da --- /dev/null +++ b/tests/cuda-device-query.yaml @@ -0,0 +1,21 @@ +apiVersion: v1 +kind: Pod +metadata: + name: cuda-test-pod + labels: + app: cuda-test-pod +spec: + runtimeClassName: nvidia + restartPolicy: Never + containers: + - name: cuda-container + image: nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0-ubuntu22.04 + resources: + limits: + nvidia.com/gpu: "1" # requesting 1 GPU + cpu: "1" + memory: 0.5Gi + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule diff --git a/tests/cuda-vector-add.yaml b/tests/cuda-vector-add.yaml new file mode 100644 index 0000000..a62ba98 --- /dev/null +++ b/tests/cuda-vector-add.yaml @@ -0,0 +1,21 @@ +apiVersion: v1 +kind: Pod +metadata: + name: cuda-test-pod + labels: + app: cuda-test-pod +spec: + runtimeClassName: nvidia + restartPolicy: Never + containers: + - name: cuda-container + image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0-ubuntu22.04 + resources: + limits: + nvidia.com/gpu: "1" # requesting 1 GPU + cpu: "1" + memory: 0.5Gi + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule diff --git a/values/nvidia-gpu-operator-values.yaml b/values/nvidia-gpu-operator-values.yaml new file mode 100644 index 0000000..a35fe71 --- /dev/null +++ b/values/nvidia-gpu-operator-values.yaml @@ -0,0 +1,284 @@ +# See the NVIDIA GPU Operator repository for more details on available values +# https://github.com/NVIDIA/gpu-operator/blob/main/deployments/gpu-operator/values.yaml + +platform: + openshift: false + +nfd: + # usually enabled by default, but choose to use external NFD from IronBank + enabled: true + nodefeaturerules: false + +psa: + enabled: false + +cdi: + enabled: false + +sandboxWorkloads: + enabled: false + +hostPaths: + # rootFS represents the path to the root filesystem of the host. + # This is used by components that need to interact with the host filesystem + # and as such this must be a chroot-able filesystem. + # Examples include the MIG Manager and Toolkit Container which may need to + # stop, start, or restart systemd services + rootFS: "/" + + # driverInstallDir represents the root at which driver files including libraries, + # config files, and executables can be found. + driverInstallDir: "/run/nvidia/driver" + +daemonsets: + labels: {} + annotations: {} + priorityClassName: system-node-critical + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + # configuration for controlling update strategy("OnDelete" or "RollingUpdate") of GPU Operands + # note that driver Daemonset is always set with OnDelete to avoid unintended disruptions + updateStrategy: "RollingUpdate" + # configuration for controlling rolling update of GPU Operands + rollingUpdate: + # maximum number of nodes to simultaneously apply pod updates on. + # can be specified either as number or percentage of nodes. Default 1. 
+ maxUnavailable: "1" + +validator: + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: [] + args: [] + resources: {} + plugin: + env: + - name: WITH_WORKLOAD + value: "false" + driver: + env: + - name: DISABLE_DEV_CHAR_SYMLINK_CREATION + value: "true" + - name: NVIDIA_VISIBLE_DEVICES + value: all + # Default value of "all" causes the "display" capability to also be considered; + # however, not all hosts have or allow that capability, causing the daemonset to fail + - name: NVIDIA_DRIVER_CAPABILITIES + value: compute,utility + +operator: + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + priorityClassName: system-node-critical + # Can be set to `containerd`, `docker`, etc. + defaultRuntime: docker + runtimeClass: nvidia + use_ocp_driver_toolkit: false + # cleanup CRD on chart un-install + cleanupCRD: false + # upgrade CRD on chart upgrade, requires --disable-openapi-validation flag + # to be passed during helm upgrade. + upgradeCRD: false + initContainer: + image: cuda + + tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Equal" + value: "" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/control-plane" + operator: "Equal" + value: "" + effect: "NoSchedule" + annotations: + openshift.io/scc: restricted-readonly + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + preference: + matchExpressions: + - key: "node-role.kubernetes.io/master" + operator: In + values: [""] + - weight: 1 + preference: + matchExpressions: + - key: "node-role.kubernetes.io/control-plane" + operator: In + values: [""] + logging: + # Zap time encoding (one of 'epoch', 'millis', 'nano', 'iso8601', 'rfc3339' or 'rfc3339nano') + timeEncoding: epoch + # Zap Level to configure the verbosity of logging. 
Can be one of 'debug', 'info', 'error', or any integer value > 0 which corresponds to custom debug levels of increasing verbosity + level: info + # Development Mode defaults(encoder=consoleEncoder,logLevel=Debug,stackTraceLevel=Warn) + # Production Mode defaults(encoder=jsonEncoder,logLevel=Info,stackTraceLevel=Error) + develMode: true + resources: + limits: + cpu: 500m + memory: 350Mi + requests: + cpu: 200m + memory: 100Mi + +mig: + strategy: single + +driver: + # usually enabled by default, depends on deployment environment + enabled: false + +toolkit: + # usually enabled by default, depends on deployment environment + enabled: false + +devicePlugin: + enabled: true + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + args: [] + env: + - name: PASS_DEVICE_SPECS + value: "true" + - name: FAIL_ON_INIT_ERROR + value: "true" + - name: DEVICE_LIST_STRATEGY + value: envvar + - name: DEVICE_ID_STRATEGY + value: uuid + - name: NVIDIA_VISIBLE_DEVICES + value: all + # Default value of "all" causes the "display" capability to also be considered; + # however, not all hosts have or allow that capability, causing the daemonset to fail + - name: NVIDIA_DRIVER_CAPABILITIES + value: compute,utility + resources: {} + config: + # Create a ConfigMap (default: false) + create: false + # ConfigMap name (either exiting or to create a new one with create=true above) + name: "" + # Default config name within the ConfigMap + default: "" + # Data section for the ConfigMap to create (i.e only applies when create=true) + data: {} + # MPS related configuration for the plugin + mps: + # MPS root path on the host + root: "/run/nvidia/mps" + +# standalone dcgm host engine +dcgm: + # disabled by default to use embedded nv-host engine by exporter + enabled: false + +dcgmExporter: + enabled: true + imagePullPolicy: IfNotPresent + env: + - name: DCGM_EXPORTER_LISTEN + value: ":9400" + - name: DCGM_EXPORTER_KUBERNETES + value: "true" + - name: DCGM_EXPORTER_COLLECTORS + value: "/etc/dcgm-exporter/dcp-metrics-included.csv" + resources: {} + serviceMonitor: + enabled: false + interval: 15s + honorLabels: false + additionalLabels: {} + relabelings: [] + +gfd: + enabled: true + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: + - name: GFD_SLEEP_INTERVAL + value: 60s + - name: GFD_FAIL_ON_INIT_ERROR + value: "true" + resources: {} + +migManager: + # usually enabled by default, depends on deployment environment + enabled: false + +nodeStatusExporter: + enabled: false + +gds: + enabled: false + +gdrcopy: + enabled: false + +vgpuManager: + enabled: false + +vgpuDeviceManager: + # usually enabled by default, depends on deployment environment + enabled: false + +vfioManager: + # usually enabled by default, depends on deployment environment + enabled: false + +kataManager: + enabled: false + +sandboxDevicePlugin: + # usually enabled by default, depends on deployment environment + enabled: false + +ccManager: + enabled: false + +node-feature-discovery: + enableNodeFeatureApi: true + gc: + enable: true + replicaCount: 1 + serviceAccount: + name: node-feature-discovery + create: false + worker: + serviceAccount: + name: node-feature-discovery + # disable creation to avoid duplicate serviceaccount creation by master spec below + create: false + tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Equal" + value: "" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/control-plane" + operator: "Equal" + value: "" + effect: "NoSchedule" + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + 
config: + sources: + pci: + deviceClassWhitelist: + - "02" + - "0200" + - "0207" + - "0300" + - "0302" + deviceLabelFields: + - vendor + master: + serviceAccount: + name: node-feature-discovery + create: true + config: + extraLabelNs: ["nvidia.com"] diff --git a/zarf.yaml b/zarf.yaml index bd00b2b..95b2b63 100644 --- a/zarf.yaml +++ b/zarf.yaml @@ -15,9 +15,21 @@ variables: description: "Name of the cluster" default: "uds" - - name: K3D_IMAGE - description: "K3d image to use" - default: "rancher/k3s:v1.30.4-k3s1" + - name: K3S_IMAGE_REPOSITORY + description: "K3s image repository to use" + default: "rancher/k3s" + + - name: K3S_IMAGE_VERSION + description: "K3d image version to use" + default: "v1.30.4-k3s1" + + - name: CUDA_IMAGE_VERSION + description: "CUDA image to use" + default: "12.1.0-base-ubuntu22.04" + + - name: NUMBER_OF_GPUS + description: "Number of GPUs to passthrough to the K3D cluster" + default: "all" - name: K3D_EXTRA_ARGS description: "Optionally pass k3d arguments to the default" @@ -37,6 +49,43 @@ components: - cmd: k3d cluster delete ${ZARF_VAR_CLUSTER_NAME} description: "Destroy the cluster" + - name: set-k3d-image + required: true + description: "Set the K3s base image" + actions: + onDeploy: + before: + - cmd: | + echo "${ZARF_VAR_K3S_IMAGE_REPOSITORY}:${ZARF_VAR_K3S_IMAGE_VERSION}" + setVariables: + - name: K3D_IMAGE + + - name: inject-cuda-image + required: true + only: + flavor: cuda + description: "Overwrites the K3s base image variable to be the CUDA K3s image" + actions: + onDeploy: + before: + - cmd: | + echo "ghcr.io/defenseunicorns/uds-k3d/cuda-k3s:${ZARF_VAR_K3S_IMAGE_VERSION}-cuda-${ZARF_VAR_CUDA_IMAGE_VERSION}" + setVariables: + - name: K3D_IMAGE + + - name: expose-gpus + required: true + only: + flavor: cuda + description: "Adds the extra K3d argument for exposing host GPUs to the cluster" + actions: + onDeploy: + before: + - cmd: | + echo "${ZARF_VAR_K3D_EXTRA_ARGS} --gpus=${ZARF_VAR_NUMBER_OF_GPUS}" + setVariables: + - name: K3D_EXTRA_ARGS + - name: create-cluster required: true description: "Create the k3d cluster" @@ -52,8 +101,9 @@ components: --k3s-arg "--disable=metrics-server@server:*" \ --k3s-arg "--disable=servicelb@server:*" \ --k3s-arg "--disable=local-storage@server:*" \ - --image ${ZARF_VAR_K3D_IMAGE} ${ZARF_VAR_K3D_EXTRA_ARGS} \ - ${ZARF_VAR_CLUSTER_NAME} + ${ZARF_VAR_K3D_EXTRA_ARGS} \ + --image ${ZARF_VAR_K3D_IMAGE} \ + "${ZARF_VAR_CLUSTER_NAME}" description: "Create the cluster" onSuccess: - cmd: | @@ -73,12 +123,12 @@ components: actions: onDeploy: before: - - cmd: ./zarf tools kubectl get nodes -o=jsonpath='{.items[0].status.addresses[?(@.type=="InternalIP")].address}' | cut -d'.' -f1-3 - description: "Load network ip base for MetalLB" + - cmd: uds zarf tools kubectl get nodes -o=jsonpath='{.items[0].status.addresses[?(@.type=="InternalIP")].address}' | cut -d'.' 
-f1-3 + description: "Load network base IP for MetalLB" setVariables: - name: BASE_IP after: - - cmd: ./zarf tools kubectl rollout restart deployment coredns -n kube-system + - cmd: uds zarf tools kubectl rollout restart deployment coredns -n kube-system description: "Restart CoreDNS to pick up internal DNS override for uds.dev" charts: - name: metallb @@ -104,3 +154,37 @@ components: url: https://charts.min.io/ valuesFiles: - "values/minio-values.yaml" + + - name: nvidia-gpu-operator + description: "Install the NVIDIA GPU Operator for CUDA-enabled clusters" + only: + flavor: cuda + required: true + charts: + - name: gpu-operator + url: https://helm.ngc.nvidia.com/nvidia + version: v24.3.0 + namespace: kube-system + valuesFiles: + - "values/nvidia-gpu-operator-values.yaml" + actions: + onDeploy: + after: + - description: "Validate nvidia-device-plugin-daemonset is up" + wait: + cluster: + kind: Pod + name: app=nvidia-device-plugin-daemonset + namespace: kube-system + # Ensure the device plugin is healthy, which might take a while depending on the machine + condition: "'{.status.conditions[2].status}'=True" + maxTotalSeconds: 600 + - description: "Validate nvidia-operator-validator is completed" + wait: + cluster: + kind: Pod + name: app=nvidia-operator-validator + namespace: kube-system + # Ensure the NVIDIA host validator job succeeds + condition: "'{.status.conditions[2].status}'=True" + maxTotalSeconds: 300