diff --git a/.github/actions/release/action.yaml b/.github/actions/release/action.yaml index 63afd9b1d..38157f59e 100644 --- a/.github/actions/release/action.yaml +++ b/.github/actions/release/action.yaml @@ -138,7 +138,7 @@ runs: run: | docker buildx build --build-arg LOCAL_VERSION=${{ inputs.releaseTag }} -t ghcr.io/defenseunicorns/leapfrogai/vllm:${{ inputs.releaseTag }} --push -f packages/vllm/Dockerfile . - zarf package create packages/vllm --set=IMAGE_VERSION=${{ inputs.releaseTag }} --flavor upstream --confirm + ZARF_CONFIG=packages/vllm/zarf-config.yaml zarf package create packages/vllm --set=IMAGE_VERSION=${{ inputs.releaseTag }} --flavor upstream --confirm zarf package publish zarf-package-vllm-amd64-${{ inputs.releaseTag }}.tar.zst oci://ghcr.io/defenseunicorns/packages${{ inputs.subRepository }}leapfrogai diff --git a/.github/release-please-config.json b/.github/release-please-config.json index 6b03844af..8f475204d 100644 --- a/.github/release-please-config.json +++ b/.github/release-please-config.json @@ -11,7 +11,6 @@ "versioning": "default", "extra-files": [ "pyproject.toml", - ".github/workflows/nightly-snapshot-release.yaml", { "type": "generic", "path": "**/Chart.yaml", @@ -27,6 +26,11 @@ "path": "**/zarf.yaml", "glob": true }, + { + "type": "generic", + "path": "**/zarf-config.yaml", + "glob": true + }, { "type": "generic", "path": "**/uds-bundle.yaml", diff --git a/.github/scripts/uds_verification_report.py b/.github/scripts/uds_verification_report.py new file mode 100755 index 000000000..0e4d4e8fe --- /dev/null +++ b/.github/scripts/uds_verification_report.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python3 + +import os +import re + + +def remove_ansi_escape_sequences(text): + ansi_escape = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]") + return ansi_escape.sub("", text) + + +# Capabilities that affect the entire capability, not just a single package +def uds_capability_wide_errors(text: str) -> bool: + if "Not all pods have the istio sidecar" in text: + return True + return False + + +# CI environment variable enables GitHub annotations +def print_package_info( + package_name, + failures_count, + errors_count, + warnings_count, + failure_descriptions, + error_descriptions, + warning_descriptions, + uds_capability_wide_errors_count, +): + if uds_capability_wide_errors_count >= 1: + errors_count -= uds_capability_wide_errors_count + if package_name: + print("-----------------------------") + if os.getenv("CI") == "true": + print(f"::group::{package_name}") + print(f"Package: {package_name}\n") + if failures_count > 0: + if os.getenv("CI") == "true": + print("::error::", end="") + print(f"⛔ Failures: {failures_count}") + else: + if errors_count > 0: + if os.getenv("CI") == "true": + print("::error::", end="") + print(f"❌ Errors: {errors_count}") + if warnings_count > 0: + if os.getenv("CI") == "true": + print("::warning::", end="") + print(f"⚠️ Warnings: {warnings_count}") + if failures_count > 0: + print("\n⛔ Failure Descriptions:") + for desc in failure_descriptions: + print(f" - {desc}") + else: + if errors_count > 0: + print("\n❌ Error Descriptions:") + for desc in error_descriptions: + print(f" - {desc}") + if warnings_count > 0: + print("\n⚠️ Warning Descriptions:") + for desc in warning_descriptions: + print(f" - {desc}") + if os.getenv("CI") == "true": + print("::endgroup::") + + +def main(): + # Read data from the specified file instead of stdin + file_path = os.path.join( + os.getenv("GITHUB_WORKSPACE", ""), "reports/intermediate-report.txt" + ) + with open(file_path, 
mode="r", encoding="utf-8", errors="ignore") as file: + data = file.read() + # Remove ANSI escape sequences + clean_data = remove_ansi_escape_sequences(data) + # Initialize variables + package_name = "" + failures_count = 0 + errors_count = 0 + warnings_count = 0 + uds_capability_wide_errors_count = 0 + failure_descriptions = [] + error_descriptions = [] + warning_descriptions = [] + uds_capability_wide_error_descriptions = [] + previous_package_name = None + + # Process each line + for line in clean_data.splitlines(): + # Remove leading and trailing whitespace + line = line.strip() + + # Match and extract the package name + match = re.match(r"^ℹ️\s+Package\s+Name:\s+(.*)$", line) + if match: + # Print the previous package's info before starting a new one + if previous_package_name is not None: + print_package_info( + previous_package_name, + failures_count, + errors_count, + warnings_count, + failure_descriptions, + error_descriptions, + warning_descriptions, + uds_capability_wide_errors_count, + ) + # Reset variables for the new package + package_name = match.group(1) + failures_count = 0 + errors_count = 0 + warnings_count = 0 + failure_descriptions = [] + error_descriptions = [] + warning_descriptions = [] + previous_package_name = package_name + continue + + if uds_capability_wide_errors(line): + uds_capability_wide_errors_count = 1 + uds_capability_wide_error_descriptions = [ + "Not all pods have the istio sidecar" + ] + continue + else: + # Match and extract counts for failures, errors, and warnings + match = re.match(r"^(❌|⚠️|⛔)\s+(\d+)\s+([a-z]+)\s+found$", line) + if match: + count = int(match.group(2)) + type_ = match.group(3) + if type_ == "errors": + errors_count = count + elif type_ == "warnings": + warnings_count = count + elif type_ == "failures": + failures_count = count + continue + + # Match and collect issue descriptions + match = re.match(r"^(❌|⚠️|⛔)\s+(.*)$", line) + if match: + emoji = match.group(1) + description = match.group(2) + if emoji == "❌": + error_descriptions.append(description) + elif emoji == "⚠️": + warning_descriptions.append(description) + elif emoji == "⛔": + failure_descriptions.append(description) + continue + + # Print the last package's information + if previous_package_name is not None: + print_package_info( + previous_package_name, + failures_count, + errors_count, + warnings_count, + failure_descriptions, + error_descriptions, + warning_descriptions, + uds_capability_wide_errors_count, + ) + if uds_capability_wide_errors_count >= 1: + print("-----------------------------") + if os.getenv("CI") == "true": + print("::group::UDS Capability-Wide Issues") + print("::error::", end="") + print("UDS Capability Issues") + print("\n❌ Error Descriptions:") + for desc in uds_capability_wide_error_descriptions: + print(f" - {desc}") + if os.getenv("CI") == "true": + print("::endgroup::") + + +if __name__ == "__main__": + main() + # Print the final ending separator + print("-----------------------------") diff --git a/.github/workflows/e2e-llama-cpp-python.yaml b/.github/workflows/e2e-llama-cpp-python.yaml index e3d573bba..b3019819f 100644 --- a/.github/workflows/e2e-llama-cpp-python.yaml +++ b/.github/workflows/e2e-llama-cpp-python.yaml @@ -32,6 +32,7 @@ on: # Ignore local development files - "!.pre-commit-config.yaml" + - "!tasks.yaml" # Ignore non e2e tests changes - "!tests/pytest/**" @@ -56,6 +57,11 @@ jobs: runs-on: ai-ubuntu-big-boy-8-core if: ${{ !github.event.pull_request.draft }} + permissions: + contents: read + packages: read + id-token: write # 
This is needed for OIDC federation. + steps: - name: Checkout Repo uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -69,6 +75,7 @@ jobs: registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} ghToken: ${{ secrets.GITHUB_TOKEN }} + chainguardIdentity: ${{ secrets.CHAINGUARD_IDENTITY }} - name: Setup API and Supabase uses: ./.github/actions/lfai-core diff --git a/.github/workflows/e2e-playwright.yaml b/.github/workflows/e2e-playwright.yaml index 7200155fe..ddf9da1c8 100644 --- a/.github/workflows/e2e-playwright.yaml +++ b/.github/workflows/e2e-playwright.yaml @@ -34,6 +34,7 @@ on: # Ignore local development files - "!.pre-commit-config.yaml" + - "!tasks.yaml" # Ignore non e2e tests changes - "!tests/pytest/**" @@ -57,6 +58,11 @@ jobs: runs-on: ai-ubuntu-big-boy-8-core if: ${{ !github.event.pull_request.draft }} + permissions: + contents: read + packages: read + id-token: write # This is needed for OIDC federation. + steps: - name: Checkout Repo uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -82,6 +88,7 @@ jobs: registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} ghToken: ${{ secrets.GITHUB_TOKEN }} + chainguardIdentity: ${{ secrets.CHAINGUARD_IDENTITY }} - name: Create Test User run: | @@ -120,7 +127,7 @@ jobs: - name: UI/API/Supabase E2E Playwright Tests run: | cp src/leapfrogai_ui/.env.example src/leapfrogai_ui/.env - rm src/leapfrogai_ui/tests/global.teardown.ts + rm src/leapfrogai_ui/tests/global.teardown.ts mkdir -p src/leapfrogai_ui/playwright/.auth SERVICE_ROLE_KEY=$(uds zarf tools kubectl get secret -n leapfrogai supabase-bootstrap-jwt -o jsonpath={.data.service-key} | base64 -d) echo "::add-mask::$SERVICE_ROLE_KEY" diff --git a/.github/workflows/e2e-text-backend-full-cpu.yaml b/.github/workflows/e2e-text-backend-full-cpu.yaml index 6e8507ae3..9e7faf01f 100644 --- a/.github/workflows/e2e-text-backend-full-cpu.yaml +++ b/.github/workflows/e2e-text-backend-full-cpu.yaml @@ -32,6 +32,7 @@ on: # Ignore local development files - "!.pre-commit-config.yaml" + - "!tasks.yaml" # Ignore non e2e tests changes - "!tests/pytest/**" @@ -57,6 +58,11 @@ jobs: runs-on: ai-ubuntu-big-boy-8-core if: ${{ !github.event.pull_request.draft }} + permissions: + contents: read + packages: read + id-token: write # This is needed for OIDC federation. 
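The `LEAPFROGAI_MODEL` environment variable added to the Test Text Backend step above appears to select which model backend the end-to-end suite exercises (here, the CPU `llama-cpp-python` backend). A minimal sketch of reproducing that step locally, assuming a deployed LeapfrogAI stack and the repository's Python dev dependencies:

```bash
# Sketch: mirror the CI step above against a local deployment; the env var and
# pytest invocation are taken from the workflow, everything else is assumed.
export LEAPFROGAI_MODEL=llama-cpp-python
python -m pytest ./tests/e2e/test_text_backend_full.py -v
```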
+ steps: - name: Checkout Repo uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -69,6 +75,8 @@ jobs: with: registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} + ghToken: ${{ secrets.GITHUB_TOKEN }} + chainguardIdentity: ${{ secrets.CHAINGUARD_IDENTITY }} - name: Setup LFAI-API and Supabase uses: ./.github/actions/lfai-core @@ -97,5 +105,7 @@ jobs: # Test ########## - name: Test Text Backend + env: + LEAPFROGAI_MODEL: llama-cpp-python run: | python -m pytest ./tests/e2e/test_text_backend_full.py -v diff --git a/.github/workflows/e2e-text-embeddings.yaml b/.github/workflows/e2e-text-embeddings.yaml index 20f7eb97a..3742de352 100644 --- a/.github/workflows/e2e-text-embeddings.yaml +++ b/.github/workflows/e2e-text-embeddings.yaml @@ -32,6 +32,7 @@ on: # Ignore local development files - "!.pre-commit-config.yaml" + - "!tasks.yaml" # Ignore non e2e tests changes - "!tests/pytest/**" @@ -58,6 +59,11 @@ jobs: runs-on: ai-ubuntu-big-boy-8-core if: ${{ !github.event.pull_request.draft }} + permissions: + contents: read + packages: read + id-token: write # This is needed for OIDC federation. + steps: - name: Checkout Repo uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -71,6 +77,7 @@ jobs: registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} ghToken: ${{ secrets.GITHUB_TOKEN }} + chainguardIdentity: ${{ secrets.CHAINGUARD_IDENTITY }} - name: Setup LFAI-API and Supabase uses: ./.github/actions/lfai-core diff --git a/.github/workflows/e2e-vllm.yaml b/.github/workflows/e2e-vllm.yaml index 07e9f046f..6f89948ad 100644 --- a/.github/workflows/e2e-vllm.yaml +++ b/.github/workflows/e2e-vllm.yaml @@ -32,6 +32,7 @@ on: # Ignore local development files - "!.pre-commit-config.yaml" + - "!tasks.yaml" # Ignore non e2e tests changes - "!tests/pytest/**" @@ -58,6 +59,11 @@ jobs: runs-on: ai-ubuntu-big-boy-8-core if: ${{ !github.event.pull_request.draft }} + permissions: + contents: read + packages: read + id-token: write # This is needed for OIDC federation. + steps: - name: Checkout Repo uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -73,7 +79,7 @@ jobs: registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} ghToken: ${{ secrets.GITHUB_TOKEN }} - udsCliVersion: 0.14.0 + chainguardIdentity: ${{ secrets.CHAINGUARD_IDENTITY }} ########## # vLLM @@ -82,4 +88,4 @@ jobs: ########## - name: Build vLLM run: | - make build-vllm LOCAL_VERSION=e2e-test + make build-vllm LOCAL_VERSION=e2e-test ZARF_CONFIG=packages/vllm/zarf-config.yaml diff --git a/.github/workflows/e2e-whisper.yaml b/.github/workflows/e2e-whisper.yaml index dee2cf45a..90e94106e 100644 --- a/.github/workflows/e2e-whisper.yaml +++ b/.github/workflows/e2e-whisper.yaml @@ -32,6 +32,7 @@ on: # Ignore local development files - "!.pre-commit-config.yaml" + - "!tasks.yaml" # Ignore non e2e tests changes - "!tests/pytest/**" @@ -56,6 +57,11 @@ jobs: runs-on: ai-ubuntu-big-boy-8-core if: ${{ !github.event.pull_request.draft }} + permissions: + contents: read + packages: read + id-token: write # This is needed for OIDC federation. 
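The `Build vLLM` step above now exports `ZARF_CONFIG=packages/vllm/zarf-config.yaml`, so `zarf package create` reads its `--set` values from a config file instead of CLI flags. The vLLM config file itself is not shown in this diff; the sketch below is an assumption modeled on the API package's `zarf-config.yaml` added later in this changeset, purely to illustrate the `package.create.set` shape that `ZARF_CONFIG` points at:

```bash
# Hypothetical example file; keys mirror packages/api/zarf-config.yaml from this PR.
cat <<'EOF' > /tmp/example-zarf-config.yaml
package:
  create:
    set:
      image_version: "0.13.1"
EOF

# ZARF_CONFIG makes the set values above available to `package create` without --set flags.
ZARF_CONFIG=/tmp/example-zarf-config.yaml uds zarf package create packages/vllm --flavor upstream --confirm
```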
+ steps: - name: Checkout Repo uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -71,6 +77,7 @@ jobs: registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} ghToken: ${{ secrets.GITHUB_TOKEN }} + chainguardIdentity: ${{ secrets.CHAINGUARD_IDENTITY }} - name: Setup LFAI-API and Supabase uses: ./.github/actions/lfai-core diff --git a/.github/workflows/nightly-snapshot-release.yaml b/.github/workflows/nightly-snapshot-release.yaml index 7be66a934..da6abcdef 100644 --- a/.github/workflows/nightly-snapshot-release.yaml +++ b/.github/workflows/nightly-snapshot-release.yaml @@ -6,11 +6,11 @@ on: workflow_dispatch: # trigger manually as needed pull_request: types: - - opened # default trigger - - reopened # default trigger - - synchronize # default trigger - - ready_for_review # don't run on draft PRs - - milestoned # allows us to trigger on bot PRs + - opened # default trigger + - reopened # default trigger + - synchronize # default trigger + - ready_for_review # don't run on draft PRs + - milestoned # allows us to trigger on bot PRs paths: - .github/workflows/nightly-snapshot-release.yaml @@ -23,10 +23,8 @@ defaults: shell: bash env: - # x-release-please-start-version - LEAPFROGAI_VERSION: 0.13.0 - # x-release-please-end SNAPSHOT_VERSION: snapshot-latest + SNAPSHOT_SUB_REPOSITORY: /uds/snapshots/ permissions: contents: read @@ -47,7 +45,7 @@ jobs: uses: ./.github/actions/release with: releaseTag: ${{ env.SNAPSHOT_VERSION }} - subRepository: /uds/snapshots/ + subRepository: ${{ env.SNAPSHOT_SUB_REPOSITORY }} registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} ghToken: ${{ secrets.GITHUB_TOKEN }} @@ -65,19 +63,29 @@ jobs: id-token: write # This is needed for OIDC federation. 
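Replacing the hard-coded `/uds/snapshots/` path with the `SNAPSHOT_SUB_REPOSITORY` environment variable keeps the snapshot publish location in one place; the release action splices it directly into the OCI path it publishes to (see the `zarf package publish` line at the top of this diff). A quick illustration of how the pieces compose:

```bash
# Illustration only: how the subRepository input expands inside the publish target.
SUB_REPOSITORY="/uds/snapshots/"
echo "oci://ghcr.io/defenseunicorns/packages${SUB_REPOSITORY}leapfrogai"
# -> oci://ghcr.io/defenseunicorns/packages/uds/snapshots/leapfrogai
```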
steps: - - name: Checkout Repo (v${{ env.LEAPFROGAI_VERSION }}) + # Checkout main just to see the latest release in the release-please manifest + - name: Checkout Repo (main) uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: - ref: v${{ env.LEAPFROGAI_VERSION }} - - - name: Setup Python (v${{ env.LEAPFROGAI_VERSION }}) - uses: ./.github/actions/python + ref: main - - name: Install Dev Dependencies + - name: Get Latest Release Version + id: get_version run: | - python -m pip install ".[dev]" ".[dev-vllm]" ".[dev-whisper]" + LFAI_VERSION=$(jq -r '.["."]' .github/.release-please-manifest.json) + echo "LFAI_VERSION=$LFAI_VERSION" >> $GITHUB_OUTPUT + + ################ + # LATEST RELEASE + ################ + + # Checkout the latest release in the release-please manifest + - name: Checkout Repo (v${{ steps.get_version.outputs.LFAI_VERSION }}) + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + ref: v${{ steps.get_version.outputs.LFAI_VERSION }} - - name: Setup UDS Cluster + - name: Setup UDS Cluster (v${{ steps.get_version.outputs.LFAI_VERSION }}) uses: ./.github/actions/uds-cluster with: registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} @@ -85,38 +93,40 @@ jobs: ghToken: ${{ secrets.GITHUB_TOKEN }} chainguardIdentity: ${{ secrets.CHAINGUARD_IDENTITY }} - # This is needed due to delay in tagged release versus - # package publishing and the latest versions of each package in the UDS bundle - - name: Mutation of the UDS Bundle + # This is needed due to delay in tagged releases versus the version refs within the UDS bundles + - name: Mutation of the UDS Bundle (v${{ steps.get_version.outputs.LFAI_VERSION }}) run: | - uds zarf tools yq -i '.metadata.version = "v${{ env.LEAPFROGAI_VERSION }}"' bundles/latest/cpu/uds-bundle.yaml + uds zarf tools yq -i '.metadata.version = "v${{ steps.get_version.outputs.LFAI_VERSION }}"' bundles/latest/cpu/uds-bundle.yaml - uds zarf tools yq -i '.packages[].ref |= sub("^[^ ]+-upstream$", "${{ env.LEAPFROGAI_VERSION }}-upstream")' bundles/latest/cpu/uds-bundle.yaml + uds zarf tools yq -i '.packages[].ref |= sub("^[^ ]+-upstream$", "${{ steps.get_version.outputs.LFAI_VERSION }}-upstream")' bundles/latest/cpu/uds-bundle.yaml - - name: Create and Deploy UDS Bundle (v${{ env.LEAPFROGAI_VERSION }}) + - name: Create and Deploy UDS Bundle (v${{ steps.get_version.outputs.LFAI_VERSION }}) run: | cd bundles/latest/cpu uds create . 
--confirm && \ - uds deploy uds-bundle-leapfrogai-amd64-v${{ env.LEAPFROGAI_VERSION }}.tar.zst --confirm --no-progress && \ - rm -rf uds-bundle-leapfrogai-amd64-v${{ env.LEAPFROGAI_VERSION }}.tar.zst && \ + uds deploy uds-bundle-leapfrogai-amd64-v${{ steps.get_version.outputs.LFAI_VERSION }}.tar.zst --confirm --no-progress && \ + rm -rf uds-bundle-leapfrogai-amd64-v${{ steps.get_version.outputs.LFAI_VERSION }}.tar.zst && \ docker system prune -af + ################# + # MAIN (SNAPSHOT) + ################# + - name: Checkout Repo (main) uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: ref: main - - name: Print the Commit SHA + - name: Print the Commit SHA (main) run: | COMMIT_SHA=$(git rev-parse HEAD) echo "The latest commit on the main branch is: $COMMIT_SHA" - - name: Install Dev Dependencies (main) - run: | - python -m pip install ".[dev]" ".[dev-vllm]" ".[dev-whisper]" --force-reinstall --no-cache-dir + - name: Setup Python (main) + uses: ./.github/actions/python # Set UDS CPU bundle refs and repositories to snapshot-latest - - name: Mutation of the UDS Bundle + - name: Mutation of the UDS Bundle (main) run: | uds zarf tools yq -i '.metadata.version = "${{ env.SNAPSHOT_VERSION }}"' bundles/latest/cpu/uds-bundle.yaml @@ -124,7 +134,7 @@ jobs: uds zarf tools yq -i '.packages[].repository |= sub("/uds/", "/uds/snapshots/")' bundles/latest/cpu/uds-bundle.yaml - - name: Create and Deploy UDS Bundle (${{ env.SNAPSHOT_VERSION }}) + - name: Create and Deploy UDS Bundle (main) run: | cd bundles/latest/cpu uds create . --confirm && \ @@ -132,6 +142,10 @@ jobs: rm -rf uds-bundle-leapfrogai-amd64-${{ env.SNAPSHOT_VERSION }}.tar.zst && \ docker system prune -af + ######### + # TESTING + ######### + - name: Generate Secrets id: generate_secrets run: | @@ -156,6 +170,7 @@ jobs: env: ANON_KEY: ${{ steps.generate_secrets.outputs.ANON_KEY }} SERVICE_KEY: ${{ steps.generate_secrets.outputs.SERVICE_KEY }} + LEAPFROGAI_MODEL: llama-cpp-python run: | python -m pytest -vvv -s ./tests/e2e diff --git a/.github/workflows/nightly-uds-badge-verification.yaml b/.github/workflows/nightly-uds-badge-verification.yaml new file mode 100644 index 000000000..6be419ebb --- /dev/null +++ b/.github/workflows/nightly-uds-badge-verification.yaml @@ -0,0 +1,94 @@ +name: nightly-uds-badge-verification + +on: + schedule: + - cron: "0 11 * * *" # Runs daily at 3 AM PST + workflow_dispatch: # trigger manually as needed + pull_request: + paths: + - .github/workflows/nightly-uds-badge-verification.yaml + - tasks.yaml + +concurrency: + group: nightly-uds-badge-verification-${{ github.ref }} + cancel-in-progress: true + +defaults: + run: + shell: bash + +env: + SNAPSHOT_VERSION: snapshot-latest + +permissions: + contents: read + packages: read + id-token: write # This is needed for OIDC federation. 
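Dropping the `LEAPFROGAI_VERSION` env (and its release-please annotations) in favor of the `Get Latest Release Version` step means the snapshot workflow now derives the latest tagged release from the release-please manifest on `main`. A small sketch of what that `jq` lookup does; the manifest contents below are illustrative, following the standard release-please format:

```bash
# Illustrative manifest; the real file lives at .github/.release-please-manifest.json.
cat > /tmp/.release-please-manifest.json <<'EOF'
{
  ".": "0.13.1"
}
EOF

# Same expression as the workflow step: read the root (".") entry as the version.
LFAI_VERSION=$(jq -r '.["."]' /tmp/.release-please-manifest.json)
echo "$LFAI_VERSION"   # -> 0.13.1, later used as the v${LFAI_VERSION} checkout ref
```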
+ +jobs: + uds-badge-verification: + runs-on: ai-ubuntu-big-boy-8-core + name: nightly_uds_badge_verification + + steps: + - name: Checkout Repo + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + ref: main + + - name: Setup UDS Cluster + uses: ./.github/actions/uds-cluster + with: + registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} + registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} + ghToken: ${{ secrets.GITHUB_TOKEN }} + chainguardIdentity: ${{ secrets.CHAINGUARD_IDENTITY }} + + - name: Print the Commit SHA + run: | + COMMIT_SHA=$(git rev-parse HEAD) + echo "The latest commit on the main branch is: $COMMIT_SHA" + + # Set UDS CPU bundle refs and repositories to snapshot-latest + - name: Mutation of the UDS Bundle + run: | + uds zarf tools yq -i '.metadata.version = "${{ env.SNAPSHOT_VERSION }}"' bundles/latest/cpu/uds-bundle.yaml + + uds zarf tools yq -i '.packages[].ref |= sub("^[^ ]+-upstream$", "${{ env.SNAPSHOT_VERSION }}-upstream")' bundles/latest/cpu/uds-bundle.yaml + + uds zarf tools yq -i '.packages[].repository |= sub("/uds/", "/uds/snapshots/")' bundles/latest/cpu/uds-bundle.yaml + + - name: Create and Deploy UDS Bundle (${{ env.SNAPSHOT_VERSION }}) + run: | + cd bundles/latest/cpu + uds create . --confirm && \ + uds deploy uds-bundle-leapfrogai-amd64-${{ env.SNAPSHOT_VERSION }}.tar.zst --confirm --no-progress && \ + rm -rf uds-bundle-leapfrogai-amd64-${{ env.SNAPSHOT_VERSION }}.tar.zst && \ + docker system prune -af + + # Workaround for handling emojis in the upstream badge verification UDS task + - name: Set Locale to UTF-8 + run: | + sudo apt-get update + sudo apt-get install -y locales + sudo locale-gen en_US.UTF-8 + export LANG=en_US.UTF-8 + export LANGUAGE=en_US:en + export LC_ALL=en_US.UTF-8 + + # Setup Python for the report cleaning script in the next step + - name: Set up Python + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + with: + python-version-file: "pyproject.toml" + + - name: Run UDS Badge Verification Task + run: | + uds run nightly-uds-badge-verification --no-progress + + - name: Archive UDS Badge Verification Report + uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6 + with: + name: uds-badge-verification-report + path: reports + retention-days: 7 diff --git a/.github/workflows/pytest.yaml b/.github/workflows/pytest.yaml index 93d0f0832..fec72192b 100644 --- a/.github/workflows/pytest.yaml +++ b/.github/workflows/pytest.yaml @@ -31,7 +31,10 @@ on: - "!packages/ui/**" # Declare default permissions as read only. -permissions: read-all +permissions: + contents: read + packages: read + id-token: write # This is needed for OIDC federation. 
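The badge-verification job above leaves its raw task output under `reports/`, which the new `.github/scripts/uds_verification_report.py` (added at the top of this diff) condenses into per-package summaries and GitHub annotations. A self-contained way to see what the script does, using a fabricated report whose lines follow the `ℹ️ Package Name:` / `❌ N errors found` format its regexes expect:

```bash
# Fabricated sample input; only the line format is taken from the script's regexes.
mkdir -p reports
cat > reports/intermediate-report.txt <<'EOF'
ℹ️ Package Name: leapfrogai-api
❌ 2 errors found
⚠️ 1 warnings found
❌ No monitors defined
❌ Not all applicable network policies are using ports
⚠️ No SSO configuration found, review needed
EOF

# Run from the repo root; GITHUB_WORKSPACE tells the script where reports/ lives.
GITHUB_WORKSPACE="$(pwd)" python3 .github/scripts/uds_verification_report.py
```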
concurrency: group: pytest-integration-${{ github.ref }} @@ -64,6 +67,7 @@ jobs: run: make test-api-unit env: LFAI_RUN_REPEATER_TESTS: true + DEV: true integration: runs-on: ai-ubuntu-big-boy-8-core @@ -97,6 +101,7 @@ jobs: registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} ghToken: ${{ secrets.GITHUB_TOKEN }} + chainguardIdentity: ${{ secrets.CHAINGUARD_IDENTITY }} - name: Setup API and Supabase uses: ./.github/actions/lfai-core diff --git a/.github/workflows/uds-lint.yaml b/.github/workflows/uds-lint.yaml index 8f2e6834c..cf7050cb8 100644 --- a/.github/workflows/uds-lint.yaml +++ b/.github/workflows/uds-lint.yaml @@ -46,3 +46,11 @@ jobs: run: | check-jsonschema bundles/latest/gpu/uds-bundle.yaml --schemafile uds.schema.json check-jsonschema bundles/latest/cpu/uds-bundle.yaml --schemafile uds.schema.json + + - name: Download UDS Tasks Schema + run: curl -o tasks.schema.json https://raw.githubusercontent.com/defenseunicorns/uds-cli/v0.14.0/tasks.schema.json + + - name: Validate tasks.yaml + if: always() + run: | + check-jsonschema tasks.yaml --schemafile tasks.schema.json diff --git a/.github/workflows/e2e-registry1-weekly.yaml b/.github/workflows/weekly-registry1-flavor-test.yaml similarity index 56% rename from .github/workflows/e2e-registry1-weekly.yaml rename to .github/workflows/weekly-registry1-flavor-test.yaml index 65f4c5897..21d799c9b 100644 --- a/.github/workflows/e2e-registry1-weekly.yaml +++ b/.github/workflows/weekly-registry1-flavor-test.yaml @@ -1,8 +1,8 @@ -name: e2e-registry1-weekly +name: weekly-registry1-flavor-test on: schedule: - - cron: "0 0 * * 6" # Run every Sunday at 12 AM EST + - cron: "0 8 * * 0" # Run every Sunday at 12 AM PST workflow_dispatch: # trigger manually as needed pull_request: types: @@ -12,11 +12,11 @@ on: - ready_for_review # don't run on draft PRs - milestoned # allows us to trigger on bot PRs paths: - - .github/workflows/e2e-registry1-weekly.yaml + - .github/workflows/weekly-registry1-flavor-test.yaml - bundles/latest/** concurrency: - group: e2e-registry1-weekly-${{ github.ref }} + group: weekly-registry1-flavor-test-${{ github.ref }} cancel-in-progress: true defaults: @@ -24,67 +24,98 @@ defaults: shell: bash jobs: - test-flavors: + registry1-flavor-test: runs-on: ai-ubuntu-big-boy-8-core - name: e2e_registry1_weekly + name: weekly_registry1_flavor_test if: ${{ !github.event.pull_request.draft }} permissions: contents: read - packages: write + packages: read id-token: write # This is needed for OIDC federation. 
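The new `uds-lint` steps validate the root `tasks.yaml` against the uds-cli v0.14.0 tasks schema. The same check can be reproduced locally, assuming `check-jsonschema` is available (installable with `pip install check-jsonschema`):

```bash
# Local equivalent of the new lint steps; schema URL copied from the workflow.
curl -o tasks.schema.json https://raw.githubusercontent.com/defenseunicorns/uds-cli/v0.14.0/tasks.schema.json
check-jsonschema tasks.yaml --schemafile tasks.schema.json
```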
steps: - - name: Checkout Repo + # Checkout main just to see the latest release in the release-please manifest + - name: Checkout Repo (main) uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: - # x-release-please-start-version - ref: "caf4f9c3093a55a003b49fcbf05c03221be6a232" # 0.12.2 w/ integration tests turned-on - # x-release-please-end + ref: main - - name: Setup Python - uses: ./.github/actions/python + - name: Get Latest Release Version + id: get_version + run: | + LFAI_VERSION=$(jq -r '.["."]' .github/.release-please-manifest.json) + echo "LFAI_VERSION=$LFAI_VERSION" >> $GITHUB_OUTPUT - - name: Install API and SDK Dev Dependencies - run : | - make install + ################ + # LATEST RELEASE + ################ + + - name: Checkout Repo + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + fetch-tags: true + ref: v${{ steps.get_version.outputs.LFAI_VERSION }} - - name: Setup UDS Cluster - uses: ./.github/actions/uds-cluster + - name: Setup UDS Environment + uses: defenseunicorns/uds-common/.github/actions/setup@24c8a2a48eeb33773b76b3587c489cb17496c9e0 # v0.12.0 with: registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} ghToken: ${{ secrets.GITHUB_TOKEN }} - udsCliVersion: 0.14.0 + chainguardIdentity: ${{ secrets.CHAINGUARD_IDENTITY }} - - name: Create UDS Cluster - shell: bash + - name: Setup Python + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c #v5.0.0 + with: + python-version-file: "pyproject.toml" + + - name: Install Python Dependencies + run: pip install ".[dev]" "src/leapfrogai_api" "src/leapfrogai_sdk" --no-cache-dir + + - name: Mutation of the Zarf Packages run: | - UDS_CONFIG=.github/config/uds-config.yaml make create-uds-cpu-cluster + uds zarf tools yq -i ' + .components[].images[0] |= sub(":v[0-9\.]+$", ":v${{ steps.get_version.outputs.LFAI_VERSION }}") + ' packages/api/zarf.yaml + uds zarf tools yq -i '.api.image.tag = "v${{ steps.get_version.outputs.LFAI_VERSION }}"' packages/api/values/registry1-values.yaml - - name: Setup Playwright + - name: Print the Modified Zarf Packages run: | - npm --prefix src/leapfrogai_ui ci - npx --prefix src/leapfrogai_ui playwright install + cat packages/api/zarf.yaml + cat packages/api/values/registry1-values.yaml - - name: Create Registry1 Packages + - name: Create Registry1 Zarf Packages run: | - LOCAL_VERSION=registry1 FLAVOR=registry1 make build-api + uds zarf package create packages/api --set image_version="${{ steps.get_version.outputs.LFAI_VERSION }}" --flavor registry1 -a amd64 --confirm # Mutate UDS bundle definition to use Registry1 packages - - name: Mutation to Registry1 Bundle - # TODO: fix bundle path + # Mutate non-Registry1 packages to be the current tagged version + - name: Mutation of the UDS Bundle run: | - uds zarf tools yq -i '.packages[1] |= del(.repository)' bundles/latest/cpu/uds-bundle.yaml - uds zarf tools yq -i '.packages[1] |= .ref = "registry1"' bundles/latest/cpu/uds-bundle.yaml - uds zarf tools yq -i '.packages[1] |= .path = "../../../packages/api"' bundles/latest/cpu/uds-bundle.yaml uds zarf tools yq -i '.metadata.version = "registry1"' bundles/latest/cpu/uds-bundle.yaml - - name: Create and Deploy Bundle + uds zarf tools yq -i '.packages[].ref |= sub("^[^ ]+-upstream$", "${{ steps.get_version.outputs.LFAI_VERSION }}-upstream")' bundles/latest/cpu/uds-bundle.yaml + + uds zarf tools yq -i '.packages[1] |= del(.repository)' bundles/latest/cpu/uds-bundle.yaml + uds 
zarf tools yq -i '.packages[1] |= .ref = "${{ steps.get_version.outputs.LFAI_VERSION }}"' bundles/latest/cpu/uds-bundle.yaml + uds zarf tools yq -i '.packages[1] |= .path = "../../../"' bundles/latest/cpu/uds-bundle.yaml + + - name: Print the Modified UDS Bundle + run: | + cat bundles/latest/cpu/uds-config.yaml + cat bundles/latest/cpu/uds-bundle.yaml + + - name: Create UDS Cluster + shell: bash + run: | + UDS_CONFIG=.github/config/uds-config.yaml make create-uds-cpu-cluster + + - name: Create and Deploy Registry1 Bundle run: | cd bundles/latest/cpu uds create . --confirm && \ - uds deploy uds-bundle-leapfrogai-amd64-registry1.tar.zst --confirm --no-progress && \ + uds deploy uds-bundle-leapfrogai-amd64-registry1.tar.zst --confirm --no-progress --log-level debug && \ rm -rf uds-bundle-leapfrogai-amd64-registry1.tar.zst && \ docker system prune -af @@ -107,32 +138,19 @@ jobs: echo "ANON_KEY is set: ${{ steps.generate_secrets.outputs.ANON_KEY != '' }}" echo "SERVICE_KEY is set: ${{ steps.generate_secrets.outputs.SERVICE_KEY != '' }}" - - name: Run Integration Tests - env: - SUPABASE_ANON_KEY: ${{ steps.generate_secrets.outputs.ANON_KEY }} - SUPABASE_PASS: ${{ steps.generate_secrets.outputs.FAKE_PASSWORD }} - SUPABASE_EMAIL: integration@uds.dev - SUPABASE_URL: https://supabase-kong.uds.dev - # Turn off NIAH tests that are not applicable for integration testing using the Repeater model - LFAI_RUN_NIAH_TESTS: "false" - run: | - uds zarf connect --name=llama-cpp-python-model --namespace=leapfrogai --local-port=50051 --remote-port=50051 & - while ! nc -z localhost 50051; do sleep 1; done - - make test-user-pipeline - env $(cat .env | xargs) python -m pytest -v -s tests/integration/api - # Backends - name: Run Backend E2E Tests env: ANON_KEY: ${{ steps.generate_secrets.outputs.ANON_KEY }} SERVICE_KEY: ${{ steps.generate_secrets.outputs.SERVICE_KEY }} + LEAPFROGAI_MODEL: llama-cpp-python + run: | + python -m pytest -vvv -s ./tests/e2e + + - name: Setup Playwright run: | - python -m pytest ./tests/e2e/test_llama.py -vv - python -m pytest ./tests/e2e/test_text_embeddings.py -vv - python -m pytest ./tests/e2e/test_whisper.py -vv - python -m pytest ./tests/e2e/test_supabase.py -vv - python -m pytest ./tests/e2e/test_api.py -vv + npm --prefix src/leapfrogai_ui ci + npx --prefix src/leapfrogai_ui playwright install - name: Run Playwright E2E Tests env: @@ -156,3 +174,12 @@ jobs: name: playwright-report path: src/leapfrogai_ui/e2e-report/ retention-days: 30 + + - name: Get Cluster Debug Information + id: debug + if: ${{ !cancelled() }} + uses: defenseunicorns/uds-common/.github/actions/debug-output@e3008473beab00b12a94f9fcc7340124338d5c08 # v0.13.1 + + - name: Get Cluster Debug Information + if: ${{ !cancelled() && steps.debug.conclusion == 'success' }} + uses: defenseunicorns/uds-common/.github/actions/save-logs@e3008473beab00b12a94f9fcc7340124338d5c08 # v0.13.1 diff --git a/.gitignore b/.gitignore index 645bd6ff5..d0c8a20f3 100644 --- a/.gitignore +++ b/.gitignore @@ -34,6 +34,7 @@ node_modules package.json package-lock.json **/*.schema.json +reports # local model and tokenizer files *.bin diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6caadd6c8..401bcba03 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -137,3 +137,26 @@ repos: files: "uds-bundle.yaml" types: [yaml] args: ["--schemafile", "uds-v0.14.0.schema.json"] + + # UDS TASKS CHECK + - repo: local + hooks: + - id: download-schema + name: "Download UDS Tasks Schema" + entry: | + bash -c 
'FILE="tasks-v0.14.0.schema.json" + if [ -f "$(git rev-parse --show-toplevel)/$FILE" ]; then + echo "$FILE already exists in the root of the git project, skipping download." + else + curl -o "$FILE" https://raw.githubusercontent.com/defenseunicorns/uds-cli/v0.14.0/tasks.schema.json + fi' + language: system + + - repo: https://github.com/python-jsonschema/check-jsonschema + rev: 0.14.0 + hooks: + - id: check-jsonschema + name: "Validate UDS Bundles Against Schema" + files: "tasks.yaml" + types: [yaml] + args: ["--schemafile", "tasks-v0.14.0.schema.json"] diff --git a/Makefile b/Makefile index bf8afb315..da9266246 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,10 @@ ARCH ?= amd64 +FLAVOR ?= upstream REG_PORT ?= 5000 REG_NAME ?= registry LOCAL_VERSION ?= $(shell git rev-parse --short HEAD) DOCKER_FLAGS := ZARF_FLAGS := -FLAVOR := upstream SILENT_DOCKER_FLAGS := --quiet SILENT_ZARF_FLAGS := --no-progress -l warn --no-color MAX_JOBS := 4 @@ -55,24 +55,34 @@ build-supabase: local-registry docker-supabase docker-api: local-registry sdk-wheel @echo $(DOCKER_FLAGS) @echo $(ZARF_FLAGS) -ifeq ($(FLAVOR),upstream) + ## Build the API image (and tag it for the local registry) docker build ${DOCKER_FLAGS} --platform=linux/${ARCH} --build-arg LOCAL_VERSION=${LOCAL_VERSION} -t ghcr.io/defenseunicorns/leapfrogai/leapfrogai-api:${LOCAL_VERSION} -f packages/api/Dockerfile . docker tag ghcr.io/defenseunicorns/leapfrogai/leapfrogai-api:${LOCAL_VERSION} localhost:${REG_PORT}/defenseunicorns/leapfrogai/leapfrogai-api:${LOCAL_VERSION} -endif + ## Build the migration container for this version of the API docker build ${DOCKER_FLAGS} --platform=linux/${ARCH} -t ghcr.io/defenseunicorns/leapfrogai/api-migrations:${LOCAL_VERSION} -f Dockerfile.migrations --build-arg="MIGRATIONS_DIR=packages/api/supabase/migrations" . 
docker tag ghcr.io/defenseunicorns/leapfrogai/api-migrations:${LOCAL_VERSION} localhost:${REG_PORT}/defenseunicorns/leapfrogai/api-migrations:${LOCAL_VERSION} -build-api: local-registry docker-api ## Build the leapfrogai_api container and Zarf package +## If registry1, don't locally Docker-build anything +ifeq ($(FLAVOR),upstream) + DOCKER_TARGETS := local-registry docker-api +else + DOCKER_TARGETS := +endif + +build-api: $(DOCKER_TARGETS) ## Build the leapfrogai_api container and Zarf package + ## Only push to local registry and build if this is an upstream-flavored package ifeq ($(FLAVOR),upstream) ## Push the images to the local registry (Zarf is super slow if the image is only in the local daemon) docker push ${DOCKER_FLAGS} localhost:${REG_PORT}/defenseunicorns/leapfrogai/leapfrogai-api:${LOCAL_VERSION} -endif docker push ${DOCKER_FLAGS} localhost:${REG_PORT}/defenseunicorns/leapfrogai/api-migrations:${LOCAL_VERSION} - ## Build the Zarf package uds zarf package create packages/api --flavor ${FLAVOR} -a ${ARCH} -o packages/api --registry-override=ghcr.io=localhost:${REG_PORT} --insecure --set IMAGE_VERSION=${LOCAL_VERSION} ${ZARF_FLAGS} --confirm +else + ## Build the registry1 Zarf package + ZARF_CONFIG=packages/api/zarf-config.yaml uds zarf package create packages/api --flavor ${FLAVOR} -a ${ARCH} -o packages/api ${ZARF_FLAGS} --confirm +endif docker-ui: ## Build the UI image (and tag it for the local registry) @@ -113,7 +123,7 @@ build-vllm: local-registry docker-vllm ## Build the vllm container and Zarf pack docker push ${DOCKER_FLAGS} localhost:${REG_PORT}/defenseunicorns/leapfrogai/vllm:${LOCAL_VERSION} ## Build the Zarf package - uds zarf package create packages/vllm --flavor ${FLAVOR} -a ${ARCH} -o packages/vllm --registry-override=ghcr.io=localhost:${REG_PORT} --insecure --set IMAGE_VERSION=${LOCAL_VERSION} ${ZARF_FLAGS} --confirm + ZARF_CONFIG=packages/vllm/zarf-config.yaml uds zarf package create packages/vllm --flavor ${FLAVOR} -a ${ARCH} -o packages/vllm --registry-override=ghcr.io=localhost:${REG_PORT} --insecure --set IMAGE_VERSION=${LOCAL_VERSION} ${ZARF_FLAGS} --confirm docker-text-embeddings: sdk-wheel ## Build the image (and tag it for the local registry) @@ -253,7 +263,7 @@ silent-deploy-llama-cpp-python-package: silent-deploy-vllm-package: @echo "Starting VLLM deployment..." 
@mkdir -p .logs - @uds zarf package deploy packages/vllm/zarf-package-vllm-${ARCH}-${LOCAL_VERSION}.tar.zst ${ZARF_FLAGS} --confirm > .logs/deploy-vllm.log 2>&1 + @ZARF_CONFIG=packages/vllm/zarf-config.yaml uds zarf package deploy packages/vllm/zarf-package-vllm-${ARCH}-${LOCAL_VERSION}.tar.zst ${ZARF_FLAGS} --confirm > .logs/deploy-vllm.log 2>&1 @echo "VLLM deployment completed" silent-deploy-text-embeddings-package: diff --git a/README.md b/README.md index 7c09b075b..4e4b1c161 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,9 @@ ![LeapfrogAI](https://github.com/defenseunicorns/leapfrogai/raw/main/docs/imgs/leapfrogai.png) [![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/defenseunicorns/leapfrogai/badge)](https://api.securityscorecards.dev/projects/github.com/defenseunicorns/leapfrogai) +[![Nightly Snapshot Tests](https://github.com/defenseunicorns/leapfrogai/actions/workflows/nightly-snapshot-release.yaml/badge.svg)](https://github.com/defenseunicorns/leapfrogai/actions/workflows/nightly-snapshot-release.yaml) +[![Nightly Made for UDS Test](https://github.com/defenseunicorns/leapfrogai/actions/workflows/nightly-uds-badge-verification.yaml/badge.svg)](https://github.com/defenseunicorns/leapfrogai/actions/workflows/nightly-uds-badge-verification.yaml) +[![Weekly Registry1 Test](https://github.com/defenseunicorns/leapfrogai/actions/workflows/weekly-registry1-flavor-test.yaml/badge.svg)](https://github.com/defenseunicorns/leapfrogai/actions/workflows/weekly-registry1-flavor-test.yaml) ## Table of Contents diff --git a/bundles/dev/gpu/uds-config.yaml b/bundles/dev/gpu/uds-config.yaml index 9ad6cfdb4..1ef7a2634 100644 --- a/bundles/dev/gpu/uds-config.yaml +++ b/bundles/dev/gpu/uds-config.yaml @@ -9,8 +9,31 @@ variables: gpu_limit: 0 # runs on CPU until GPU limit is increased vllm: - gpu_limit: 1 # if <1, vllm won't work, VLLM is GPU only - #tensor_parallel_size: 1 # TODO: reintroduce when vllm changes get pulled in + trust_remote_code: "True" + tensor_parallel_size: "1" + enforce_eager: "False" + gpu_memory_utilization: "0.90" + worker_use_ray: "True" + engine_use_ray: "True" + quantization: "None" + load_format: "auto" + # LeapfrogAI SDK runtime configuration (usually influenced by config.yaml in development) + max_context_length: "32768" + stop_tokens: ", <|im_end|>, <|endoftext|>" + prompt_format_chat_system: "SYSTEM: {}\n" + prompt_format_chat_user: "USER: {}\n" + prompt_format_chat_assistant: "ASSISTANT: {}\n" + temperature: "0.1" + top_p: "1.0" + top_k: "0" + repetition_penalty: "1.0" + max_new_tokens: "8192" + # Pod deployment configuration + gpu_limit: "1" + gpu_runtime: "nvidia" + pvc_size: "15Gi" + pvc_access_mode: "ReadWriteOnce" + pvc_storage_class: "local-path" supabase: domain: "uds.dev" diff --git a/bundles/latest/cpu/uds-bundle.yaml b/bundles/latest/cpu/uds-bundle.yaml index 747645ae3..00327dbec 100644 --- a/bundles/latest/cpu/uds-bundle.yaml +++ b/bundles/latest/cpu/uds-bundle.yaml @@ -4,35 +4,35 @@ kind: UDSBundle metadata: name: leapfrogai description: A UDS bundle for deploying LeapfrogAI - version: 0.12.2-upstream + version: 0.13.1-upstream packages: # Supabase backend for the UI and API to interface with Postgresql - name: supabase repository: ghcr.io/defenseunicorns/packages/uds/leapfrogai/supabase - ref: 0.12.2-upstream + ref: 0.13.1-upstream # API - name: leapfrogai-api repository: ghcr.io/defenseunicorns/packages/uds/leapfrogai/leapfrogai-api - ref: 0.12.2-upstream + ref: 0.13.1-upstream # Chat Model - name: llama-cpp-python repository: 
ghcr.io/defenseunicorns/packages/uds/leapfrogai/llama-cpp-python - ref: 0.12.2-upstream + ref: 0.13.1-upstream # Text Embeddings Model - name: text-embeddings repository: ghcr.io/defenseunicorns/packages/uds/leapfrogai/text-embeddings - ref: 0.12.2-upstream + ref: 0.13.1-upstream # Transcription Model - name: whisper repository: ghcr.io/defenseunicorns/packages/uds/leapfrogai/whisper - ref: 0.12.2-upstream + ref: 0.13.1-upstream # UI - name: leapfrogai-ui repository: ghcr.io/defenseunicorns/packages/uds/leapfrogai/leapfrogai-ui - ref: 0.12.2-upstream + ref: 0.13.1-upstream diff --git a/bundles/latest/gpu/uds-bundle.yaml b/bundles/latest/gpu/uds-bundle.yaml index 3867749a4..ab2a9e0f5 100644 --- a/bundles/latest/gpu/uds-bundle.yaml +++ b/bundles/latest/gpu/uds-bundle.yaml @@ -4,35 +4,35 @@ kind: UDSBundle metadata: name: leapfrogai description: A UDS bundle for deploying LeapfrogAI - version: 0.12.2-upstream + version: 0.13.1-upstream packages: # Supabase backend for the UI and API to interface with Postgresql - name: supabase repository: ghcr.io/defenseunicorns/packages/uds/leapfrogai/supabase - ref: 0.12.2-upstream + ref: 0.13.1-upstream # OpenAI-like API - name: leapfrogai-api repository: ghcr.io/defenseunicorns/packages/uds/leapfrogai/leapfrogai-api - ref: 0.12.2-upstream + ref: 0.13.1-upstream # Model for generic chat and summarization - name: vllm repository: ghcr.io/defenseunicorns/packages/uds/leapfrogai/vllm - ref: 0.12.2-upstream + ref: 0.13.1-upstream # Model for providing vector embeddings for text - name: text-embeddings repository: ghcr.io/defenseunicorns/packages/uds/leapfrogai/text-embeddings - ref: 0.12.2-upstream + ref: 0.13.1-upstream # Model for converting audio to text - name: whisper repository: ghcr.io/defenseunicorns/packages/uds/leapfrogai/whisper - ref: 0.12.2-upstream + ref: 0.13.1-upstream # UI - name: leapfrogai-ui repository: ghcr.io/defenseunicorns/packages/uds/leapfrogai/leapfrogai-ui - ref: 0.12.2-upstream + ref: 0.13.1-upstream diff --git a/bundles/latest/gpu/uds-config.yaml b/bundles/latest/gpu/uds-config.yaml index 9ad6cfdb4..1ef7a2634 100644 --- a/bundles/latest/gpu/uds-config.yaml +++ b/bundles/latest/gpu/uds-config.yaml @@ -9,8 +9,31 @@ variables: gpu_limit: 0 # runs on CPU until GPU limit is increased vllm: - gpu_limit: 1 # if <1, vllm won't work, VLLM is GPU only - #tensor_parallel_size: 1 # TODO: reintroduce when vllm changes get pulled in + trust_remote_code: "True" + tensor_parallel_size: "1" + enforce_eager: "False" + gpu_memory_utilization: "0.90" + worker_use_ray: "True" + engine_use_ray: "True" + quantization: "None" + load_format: "auto" + # LeapfrogAI SDK runtime configuration (usually influenced by config.yaml in development) + max_context_length: "32768" + stop_tokens: ", <|im_end|>, <|endoftext|>" + prompt_format_chat_system: "SYSTEM: {}\n" + prompt_format_chat_user: "USER: {}\n" + prompt_format_chat_assistant: "ASSISTANT: {}\n" + temperature: "0.1" + top_p: "1.0" + top_k: "0" + repetition_penalty: "1.0" + max_new_tokens: "8192" + # Pod deployment configuration + gpu_limit: "1" + gpu_runtime: "nvidia" + pvc_size: "15Gi" + pvc_access_mode: "ReadWriteOnce" + pvc_storage_class: "local-path" supabase: domain: "uds.dev" diff --git a/docs/DEVELOPMENT.md b/docs/DEVELOPMENT.md index 897bfaf5d..98343ef7f 100644 --- a/docs/DEVELOPMENT.md +++ b/docs/DEVELOPMENT.md @@ -13,20 +13,20 @@ Please first see the pre-requisites listed on the LeapfrogAI documentation websi It is **_HIGHLY RECOMMENDED_** that PyEnv be installed on your machine, and a new 
virtual environment is created for every new development branch.

-Follow the installation instructions outlined in the [pyenv](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation) repository to install Python 3.11.6:
+Follow the installation instructions outlined in the [pyenv](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation) repository to install Python 3.11.9:

 ```bash
 # install the correct python version
- pyenv install 3.11.6
+ pyenv install 3.11.9

 # create a new virtual environment named "leapfrogai"
- pyenv virtualenv 3.11.6 leapfrogai
+ pyenv virtualenv 3.11.9 leapfrogai

 # activate the virtual environment
 pyenv activate leapfrogai
 ```

-If your installation process completes successfully but indicates missing packages such as `sqlite3`, execute the following command to install the required packages then proceed with the reinstallation of Python 3.11.6:
+If your installation process completes successfully but indicates missing packages such as `sqlite3`, execute the following command to install the required packages then proceed with the reinstallation of Python 3.11.9:

 ```bash
 sudo apt-get install build-essential zlib1g-dev libffi-dev \
@@ -62,6 +62,52 @@ Many of the directories and sub-directories within this project contain Make tar

 Please refer to each Makefile for more arguments and details on what each target does and is dependent on.

+## UDS Tasks
+
+UDS tasks use the UDS CLI runner, and are defined in the root `tasks.yaml` file.
+
+Currently, the only tasks within the file check LeapfrogAI's progress toward the `Made for UDS` packaging standards. To run the verification task, you must have a [UDS Kubernetes cluster](../packages/k3d-gpu/README.md) and LeapfrogAI (GPU or CPU) deployed. After deploying both major capabilities, you can execute the following:
+
+```bash
+uds run nightly-uds-badge-verification --no-progress
+```
+
+You should get output similar to this, depending on how many components of LeapfrogAI are actually deployed:
+
+```bash
+ • Running "Create Reports Directory"
+
+ ✔ Completed "Create Reports Directory"
+
+ • Running "Run UDS Badge Verification Task"
+
+ ✔ Completed "Run UDS Badge Verification Task"
+
+ • Running "Clean Up Final Report"
+-----------------------------
+Package: leapfrogai-api
+
+❌ Errors: 4
+⚠️ Warnings: 3
+
+❌ Error Descriptions:
+ - Endpoint leapfrogai-api.uds.dev is returning 404
+ - Not all applicable network policies are using selectors
+ - Not all applicable network policies are using ports
+ - No monitors defined
+
+⚠️ Warning Descriptions:
+ - Version is not consistent across flavors and package
+ - Network policies with 'remoteGenerated: Anywhere' are present, review needed
+ - No SSO configuration found, review needed
+-----------------------------
+UDS Capability Issues
+
+❌ Error Descriptions:
+ - Not all pods have the istio sidecar
+-----------------------------
+```
+
 ## Environment Variables

 Be wary of `*config*.yaml` or `.env*` files that are in individual components of the stack. The component's README will usually tell the developer when to fill them out or supply environment variables to a script.
@@ -81,6 +127,7 @@ uds zarf tools registry prune --confirm # create and deploy the new package # FLAVOR can be upstream (default) or registry1 - see README for availability details +# See individual sub-directories for any flavor-specific instructions (e.g., packages/api/README.md) LOCAL_VERSION=dev FLAVOR=upstream REGISTRY_PORT=5000 ARCH=amd64 make build-api LOCAL_VERSION=dev FLAVOR=upstream REGISTRY_PORT=5000 ARCH=amd64 make deploy-api ``` @@ -107,6 +154,7 @@ uds zarf package deploy zarf-package-*.tar.zst --confirm ```bash # FLAVOR can be upstream (default) or registry1 - see README for availability details + # See individual sub-directories for any flavor-specific instructions (e.g., packages/api/README.md) LOCAL_VERSION=dev FLAVOR=upstream ARCH=amd64 make build-cpu # ui, api, llama-cpp-python, text-embeddings, whisper, supabase # OR LOCAL_VERSION=dev FLAVOR=upstream ARCH=amd64 make build-gpu # ui, api, vllm, text-embeddings, whisper, supabase @@ -120,6 +168,7 @@ uds zarf package deploy zarf-package-*.tar.zst --confirm ```bash # FLAVOR can be upstream (default) or registry1 - see README for availability details + # See individual sub-directories for any flavor-specific instructions (e.g., packages/api/README.md) LOCAL_VERSION=dev FLAVOR=upstream ARCH=amd64 make build-ui LOCAL_VERSION=dev FLAVOR=upstream ARCH=amd64 make build-api LOCAL_VERSION=dev FLAVOR=upstream ARCH=amd64 make build-supabase @@ -154,7 +203,7 @@ Although not provided in the example UDS bundle manifests found in this reposito - name: leapfrogai-api repository: ghcr.io/defenseunicorns/packages/leapfrogai/leapfrogai-api # x-release-please-start-version - ref: 0.12.2 + ref: 0.13.1 # x-release-please-end # THE BELOW LINES WERE ADDED FOR DEMONSTRATION PURPOSES @@ -188,6 +237,7 @@ To demonstrate what this would look like for an Apple Silicon Mac: ```bash # FLAVOR can be upstream (default) or registry1 - see README for availability details +# See individual sub-directories for any flavor-specific instructions (e.g., packages/api/README.md) REG_PORT=5001 ARCH=arm64 LOCAL_VERSION=dev FLAVOR=upstream make build-cpu ``` @@ -195,6 +245,7 @@ To demonstrate what this would look like for an older Intel Mac: ```bash # FLAVOR can be upstream (default) or registry1 - see README for availability details +# See individual sub-directories for any flavor-specific instructions (e.g., packages/api/README.md) REG_PORT=5001 ARCH=arm64 LOCAL_VERSION=dev FLAVOR=upstream make build-cpu ``` diff --git a/mk-clean.mk b/mk-clean.mk index ff7e8c61d..4ca00ae89 100644 --- a/mk-clean.mk +++ b/mk-clean.mk @@ -15,8 +15,8 @@ clean-artifacts: # Zarf packages, UDS bundles, Python build artifacts, etc. 
clean-cache: -rm -rf ./**/__pycache__ ./**/*/__pycache__ ./**/**/*/__pycache__ - -rm -rf ./**/*/.ruff_cache ./**/.ruff_cache - -rm -rf ./**/.pytest_cache ./**/*/.pytest_cache + -rm -rf ./.ruff_cache ./**/*/.ruff_cache ./**/.ruff_cache + -rm -rf ./.pytest_cache ./**/.pytest_cache ./**/*/.pytest_cache -rm -rf ./.mypy_cache clean-env: diff --git a/packages/api/README.md b/packages/api/README.md index aa2b34690..2d68d67f8 100644 --- a/packages/api/README.md +++ b/packages/api/README.md @@ -27,6 +27,13 @@ make build-api LOCAL_VERSION=dev FLAVOR=upstream uds zarf package deploy packages/api/zarf-package-leapfrogai-api-*-dev.tar.zst --confirm ``` +For other package flavors, use the following example: + +```bash +make build-api FLAVOR=registry1 +uds zarf package deploy packages/api/zarf-package-leapfrogai-api-*-dev.tar.zst --confirm +``` + ### Local Development See the [source code documentation](../../src/leapfrogai_api/README.md) for running the API from the source code for local Python environment development. diff --git a/packages/api/chart/templates/istio-admin.yaml b/packages/api/chart/templates/istio-admin.yaml new file mode 100644 index 000000000..c369e8786 --- /dev/null +++ b/packages/api/chart/templates/istio-admin.yaml @@ -0,0 +1,24 @@ +{{- if .Capabilities.APIVersions.Has "security.istio.io/v1beta1" }} +apiVersion: security.istio.io/v1beta1 +kind: AuthorizationPolicy +metadata: + name: api-block-metrics-access-from-public-gateway + namespace: {{ .Release.Namespace }} +spec: + selector: + matchLabels: + {{- include "chart.selectorLabels" . | nindent 6 }} + action: DENY + rules: + - to: + - operation: + ports: + - "8080" + paths: + - /metrics* + from: + - source: + notNamespaces: + - istio-admin-gateway + - monitoring +{{- end }} diff --git a/packages/api/chart/templates/uds-package.yaml b/packages/api/chart/templates/uds-package.yaml index a6a83dea8..17220788d 100644 --- a/packages/api/chart/templates/uds-package.yaml +++ b/packages/api/chart/templates/uds-package.yaml @@ -7,6 +7,11 @@ metadata: labels: {{- include "chart.labels" . | nindent 4 }} spec: + monitor: + - portName: http + targetPort: {{ .Values.api.service.port }} + selector: + {{- include "chart.selectorLabels" . | nindent 8 }} network: expose: - service: {{ include "chart.fullname" . 
}} diff --git a/packages/api/chart/values.yaml b/packages/api/chart/values.yaml index 65b397e46..4c217ba8a 100644 --- a/packages/api/chart/values.yaml +++ b/packages/api/chart/values.yaml @@ -25,6 +25,8 @@ api: value: "*.toml" - name: DEFAULT_EMBEDDINGS_MODEL value: "text-embeddings" + - name: DEV + value: "false" - name: PORT value: "8080" - name: SUPABASE_URL diff --git a/packages/api/values/registry1-values.yaml b/packages/api/values/registry1-values.yaml index d269c6415..91f92b168 100644 --- a/packages/api/values/registry1-values.yaml +++ b/packages/api/values/registry1-values.yaml @@ -1,9 +1,7 @@ api: image: repository: "registry1.dso.mil/ironbank/opensource/defenseunicorns/leapfrogai/api" - # x-release-please-start-version - tag: v0.12.2 - # x-release-please-end + tag: v###ZARF_CONST_IMAGE_VERSION### expose: "###ZARF_VAR_EXPOSE_API###" @@ -16,6 +14,8 @@ api: value: "*.toml" - name: DEFAULT_EMBEDDINGS_MODEL value: "###ZARF_VAR_DEFAULT_EMBEDDINGS_MODEL###" + - name: DEV + value: "###ZARF_VAR_DEV###" - name: PORT value: "8080" - name: SUPABASE_URL diff --git a/packages/api/values/upstream-values.yaml b/packages/api/values/upstream-values.yaml index 6d867260e..ef2dcdad9 100644 --- a/packages/api/values/upstream-values.yaml +++ b/packages/api/values/upstream-values.yaml @@ -14,6 +14,8 @@ api: value: "*.toml" - name: DEFAULT_EMBEDDINGS_MODEL value: "###ZARF_VAR_DEFAULT_EMBEDDINGS_MODEL###" + - name: DEV + value: "###ZARF_VAR_DEV###" - name: PORT value: "8080" - name: SUPABASE_URL diff --git a/packages/api/zarf-config.yaml b/packages/api/zarf-config.yaml new file mode 100644 index 000000000..475ac2d48 --- /dev/null +++ b/packages/api/zarf-config.yaml @@ -0,0 +1,6 @@ +package: + create: + set: + # x-release-please-start-version + image_version: "0.13.1" + # x-release-please-end diff --git a/packages/api/zarf.yaml b/packages/api/zarf.yaml index 4fa6c59f2..51e0b5f38 100644 --- a/packages/api/zarf.yaml +++ b/packages/api/zarf.yaml @@ -16,6 +16,9 @@ variables: description: "Flag to expose the OpenAPI schema for debugging." - name: DEFAULT_EMBEDDINGS_MODEL default: "text-embeddings" + - name: DEV + default: "false" + description: "Flag to enable development endpoints." components: - name: leapfrogai-api @@ -47,7 +50,7 @@ components: valuesFiles: - "values/registry1-values.yaml" images: - - "registry1.dso.mil/ironbank/opensource/defenseunicorns/leapfrogai/api:v0.12.2" + - "registry1.dso.mil/ironbank/opensource/defenseunicorns/leapfrogai/api:v###ZARF_PKG_TMPL_IMAGE_VERSION###" # TODO: replace with Ironbank image once hardened: registry1.dso.mil/ironbank/opensource/defenseunicorns/leapfrogai/api/migrations - "ghcr.io/defenseunicorns/leapfrogai/api-migrations:###ZARF_PKG_TMPL_IMAGE_VERSION###" - "registry1.dso.mil/ironbank/kiwigrid/k8s-sidecar:1.23.3" diff --git a/packages/ui/chart/templates/ui/service.yaml b/packages/ui/chart/templates/ui/service.yaml index 15243e806..2cb919567 100644 --- a/packages/ui/chart/templates/ui/service.yaml +++ b/packages/ui/chart/templates/ui/service.yaml @@ -18,11 +18,3 @@ spec: protocol: TCP port: {{ .Values.service.port }} targetPort: {{ .Values.service.port }} ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: {{ include "chart.serviceAccountName" . }} - namespace: {{ .Release.Namespace | default "leapfrogai" }} - labels: - {{- include "chart.labels" . 
| nindent 4 }} diff --git a/packages/vllm/.env.example b/packages/vllm/.env.example index 1e3a00170..0a995e234 100644 --- a/packages/vllm/.env.example +++ b/packages/vllm/.env.example @@ -1,13 +1,12 @@ -export LAI_HF_HUB_ENABLE_HF_TRANSFER="1" -export LAI_REPO_ID="TheBloke/Synthia-7B-v2.0-GPTQ" -export LAI_REVISION="gptq-4bit-32g-actorder_True" -export LAI_QUANTIZATION="gptq" -export LAI_TENSOR_PARALLEL_SIZE=1 -export LAI_MODEL_SOURCE=".model/" -export LAI_MAX_CONTEXT_LENGTH=32768 -export LAI_STOP_TOKENS='["","<|endoftext|>","<|im_end|>"]' -export LAI_PROMPT_FORMAT_CHAT_SYSTEM="SYSTEM: {}\n" -export LAI_PROMPT_FORMAT_CHAT_ASSISTANT="ASSISTANT: {}\n" -export LAI_PROMPT_FORMAT_CHAT_USER="USER: {}\n" -export LAI_PROMPT_FORMAT_DEFAULTS_TOP_P=1.0 -export LAI_PROMPT_FORMAT_DEFAULTS_TOP_K=0 \ No newline at end of file +LFAI_REPO_ID="TheBloke/SynthIA-7B-v2.0-GPTQ" +LFAI_REVISION="gptq-4bit-32g-actorder_True" + +VLLM_TENSOR_PARALLEL_SIZE=1 +VLLM_TRUST_REMOTE_CODE=True +VLLM_MAX_CONTEXT_LENGTH=32768 +VLLM_ENFORCE_EAGER=False +VLLM_GPU_MEMORY_UTILIZATION=0.90 +VLLM_WORKER_USE_RAY=True +VLLM_ENGINE_USE_RAY=True +VLLM_QUANTIZATION=None +VLLM_LOAD_FORMAT=auto diff --git a/packages/vllm/Dockerfile b/packages/vllm/Dockerfile index 8676f5eda..f53088ead 100755 --- a/packages/vllm/Dockerfile +++ b/packages/vllm/Dockerfile @@ -6,8 +6,9 @@ FROM nvidia/cuda:12.2.2-devel-ubuntu22.04 AS builder # set SDK location # set the pyenv and Python versions ARG SDK_DEST=src/leapfrogai_sdk/build \ - PYTHON_VERSION=3.11.6 \ - PYENV_GIT_TAG=v2.4.8 + PYTHON_VERSION=3.11.9 \ + PYENV_GIT_TAG=v2.4.8\ + COMPONENT_DIRECTORY="packages/vllm" # use root user for deps installation and nonroot user creation USER root @@ -41,7 +42,7 @@ USER nonroot # copy-in SDK from sdk stage and vllm source code from host WORKDIR /home/leapfrogai COPY --from=sdk --chown=nonroot:nonroot /leapfrogai/${SDK_DEST} ./${SDK_DEST} -COPY --chown=nonroot:nonroot packages/vllm packages/vllm +COPY --chown=nonroot:nonroot ${COMPONENT_DIRECTORY} packages/vllm # create virtual environment for light-weight portability and minimal libraries RUN curl https://pyenv.run | bash && \ @@ -54,10 +55,10 @@ RUN curl https://pyenv.run | bash && \ ENV PYENV_ROOT="/home/nonroot/.pyenv" \ PATH="/home/nonroot/.pyenv/bin:$PATH" -# Install Python 3.11.6, set it as global, and create a venv +# Install Python, set it as global, and create a venv RUN . 
~/.bashrc && \ - PYTHON_CONFIGURE_OPTS="--enable-shared" pyenv install 3.11.6 && \ - pyenv global 3.11.6 && \ + PYTHON_CONFIGURE_OPTS="--enable-shared" pyenv install 3.11.9 && \ + pyenv global ${PYTHON_VERSION} && \ pyenv exec python -m venv .venv # set path to venv python @@ -67,26 +68,15 @@ RUN rm -f packages/vllm/build/*.whl && \ python -m pip wheel packages/vllm -w packages/vllm/build --find-links=${SDK_DEST} && \ pip install packages/vllm/build/lfai_vllm*.whl --no-index --find-links=packages/vllm/build/ +################# +# FINAL CONTAINER +################# + FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04 # set SDK location ARG SDK_DEST=src/leapfrogai_sdk/build -# model-specific arguments -ARG ARG HF_HUB_ENABLE_HF_TRANSFER="1" \ - REPO_ID="TheBloke/Synthia-7B-v2.0-GPTQ" \ - REVISION="gptq-4bit-32g-actorder_True" \ - MODEL_SOURCE="/data/.model/" \ - MAX_CONTEXT_LENGTH=32768 \ - STOP_TOKENS='[""]' \ - PROMPT_FORMAT_CHAT_SYSTEM="SYSTEM: {}\n" \ - PROMPT_FORMAT_CHAT_USER="USER: {}\n" \ - PROMPT_FORMAT_CHAT_ASSISTANT="ASSISTANT: {}\n" \ - PROMPT_FORMAT_DEFAULTS_TOP_P=1.0 \ - PROMPT_FORMAT_DEFAULTS_TOP_K=0 \ - TENSOR_PARALLEL_SIZE=1 \ - QUANTIZATION="gptq" - # setup nonroot user and permissions USER root RUN groupadd -g 65532 vglusers && \ @@ -101,24 +91,10 @@ COPY --from=sdk --chown=nonroot:nonroot /leapfrogai/${SDK_DEST} ./${SDK_DEST} COPY --from=builder --chown=nonroot:nonroot /home/leapfrogai/.venv /home/leapfrogai/.venv COPY --from=builder --chown=nonroot:nonroot /home/leapfrogai/packages/vllm/src /home/leapfrogai/packages/vllm/src # copy-in python binaries -COPY --from=builder --chown=nonroot:nonroot /home/nonroot/.pyenv/versions/3.11.6/ /home/nonroot/.pyenv/versions/3.11.6/ - -# load ARG values into env variables for pickup by confz -ENV LAI_HF_HUB_ENABLE_HF_TRANSFER=${HF_HUB_ENABLE_HF_TRANSFER} \ - LAI_REPO_ID=${REPO_ID} \ - LAI_REVISION=${REVISION} \ - LAI_MODEL_SOURCE=${MODEL_SOURCE} \ - LAI_MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH} \ - LAI_STOP_TOKENS=${STOP_TOKENS} \ - LAI_PROMPT_FORMAT_CHAT_SYSTEM=${PROMPT_FORMAT_CHAT_SYSTEM} \ - LAI_PROMPT_FORMAT_CHAT_USER=${PROMPT_FORMAT_CHAT_USER} \ - LAI_PROMPT_FORMAT_CHAT_ASSISTANT=${PROMPT_FORMAT_CHAT_ASSISTANT} \ - LAI_PROMPT_FORMAT_DEFAULTS_TOP_P=${PROMPT_FORMAT_DEFAULTS_TOP_P} \ - LAI_PROMPT_FORMAT_DEFAULTS_TOP_K=${PROMPT_FORMAT_DEFAULTS_TOP_K} \ - LAI_TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE} \ - LAI_QUANTIZATION=${QUANTIZATION} \ - # remove vLLM callback to stats server - VLLM_NO_USAGE_STATS=1 +COPY --from=builder --chown=nonroot:nonroot /home/nonroot/.pyenv/versions/${PYTHON_VERSION}/ /home/nonroot/.pyenv/versions/${PYTHON_VERSION}/ + +# remove vLLM callback to stats server +ENV VLLM_NO_USAGE_STATS=1 ENV PATH="/home/leapfrogai/.venv/bin:$PATH" diff --git a/packages/vllm/Makefile b/packages/vllm/Makefile index 98e8b29db..c764a78f2 100644 --- a/packages/vllm/Makefile +++ b/packages/vllm/Makefile @@ -1,6 +1,27 @@ +ARCH ?= amd64 +LOCAL_VERSION ?= $(shell git rev-parse --short HEAD) +DOCKER_FLAGS := + install: python -m pip install ../../src/leapfrogai_sdk python -m pip install -e ".[dev]" -dev: - python -m leapfrogai_sdk.cli --app-dir=src/ main:Model +download: + @env $$(cat .env | xargs) python src/model_download.py + +dev: download + @env $$(cat .env | xargs) python -m leapfrogai_sdk.cli --app-dir=src/ main:Model + +docker: download + docker build ${DOCKER_FLAGS} \ + --platform=linux/${ARCH} \ + --build-arg LOCAL_VERSION=${LOCAL_VERSION} \ + --build-arg COMPONENT_DIRECTORY="./" \ + -t 
ghcr.io/defenseunicorns/leapfrogai/vllm:${LOCAL_VERSION} \ + -f ./Dockerfile . + + docker run -it --rm \ + --env-file ./.env \ + -v $(PWD)/config.yaml:/home/leapfrogai/config.yaml \ + -v $(PWD)/.model:/home/leapfrogai/.model \ + ghcr.io/defenseunicorns/leapfrogai/vllm:${LOCAL_VERSION} diff --git a/packages/vllm/README.md b/packages/vllm/README.md index a55238cfd..5bc7a052f 100644 --- a/packages/vllm/README.md +++ b/packages/vllm/README.md @@ -16,13 +16,21 @@ See the LeapfrogAI documentation website for [system requirements](https://docs. The default model that comes with this backend in this repository's officially released images is a [4-bit quantization of the Synthia-7b model](https://huggingface.co/TheBloke/SynthIA-7B-v2.0-GPTQ). -You can optionally specify different models or quantization types using the following Docker build arguments: +All of the commands in this sub-section are executed within this `packages/vllm` sub-directory. -- `--build-arg HF_HUB_ENABLE_HF_TRANSFER="1"`: Enable or disable HuggingFace Hub transfer (default: 1) -- `--build-arg REPO_ID="TheBloke/Synthia-7B-v2.0-GPTQ"`: HuggingFace repository ID for the model -- `--build-arg REVISION="gptq-4bit-32g-actorder_True"`: Revision or commit hash for the model -- `--build-arg QUANTIZATION="gptq"`: Quantization type (e.g., gptq, awq, or empty for un-quantized) -- `--build-arg TENSOR_PARALLEL_SIZE="1"`: The number of gpus to spread the tensor processing across +Optionally, you can specify a different model during Zarf creation: + +```bash +uds zarf package create --confirm --set MODEL_REPO_ID=defenseunicorns/Hermes-2-Pro-Mistral-7B-4bit-32g --set MODEL_REVISION=main +``` + +If you decide to use a different model, there will likely be a need to change generation and engine runtime configurations, please see the [Zarf Package Config](./zarf-config.yaml) and the [values override file](./values/upstream-values.yaml) for details on what runtime parameters can be modified. These parameters are model-specific, and can be found in the HuggingFace model cards and/or configuration files (e.g., prompt templates). + +For example, during Zarf deployment, you can override the Zarf Package Config defaults by doing the following: + +```bash +uds zarf package deploy zarf-package-vllm-amd64-dev.tar.zst --confirm --set ENFORCE_EAGER=True +``` ### Deployment @@ -39,11 +47,26 @@ uds zarf package deploy packages/vllm/zarf-package-vllm-*-dev.tar.zst --confirm ### Local Development -To run the vllm backend locally: +In local development the [config.yaml](./config.yaml) and [.env.example](./.env.example) must be modified if the model has changed away from the default. The LeapfrogAI SDK picks up the `config.yaml` automatically, and the `.env` must be sourced into the Python environment. > [!IMPORTANT] > Execute the following commands from this sub-directory +Create a `.env` file based on the [`.env.example`](./.env.example): + +```bash +cp .env.example .env +source .env +``` + +As necessary, modify the existing [`config.yaml`](./config.yaml): + +```bash +vim config.yaml +``` + +To run the vllm backend locally: + ```bash # Install dev and runtime dependencies make install @@ -54,3 +77,19 @@ python src/model_download.py # Start the model backend make dev ``` + +#### Local Docker Container + +To run the Docker container, use the following Makefile commands. `LOCAL_VERSION` must be consistent across the two Make commands. 
+ +In the root of the LeapfrogAI repository: + +```bash +LOCAL_VERSION=dev make sdk-wheel +``` + +In the root of this vLLM sub-directory: + +```bash +LOCAL_VERSION=dev make docker +``` diff --git a/packages/vllm/chart/templates/deployment.yaml b/packages/vllm/chart/templates/deployment.yaml index 7b88cc137..3f8aa0540 100644 --- a/packages/vllm/chart/templates/deployment.yaml +++ b/packages/vllm/chart/templates/deployment.yaml @@ -36,7 +36,7 @@ spec: [ "sh", "-c", - 'while [ ! -f /data/.model/###ZARF_DATA_INJECTION_MARKER### ]; do echo "waiting for zarf data sync" && sleep 1; done; echo "we are done waiting!"', + 'while [ ! -f ###ZARF_CONST_MODEL_PATH###/###ZARF_DATA_INJECTION_MARKER### ]; do echo "waiting for zarf data sync" && sleep 1; done; echo "we are done waiting!"', ] resources: {{- toYaml .Values.modelInjectionContainer.resources | nindent 12 }} @@ -46,6 +46,9 @@ spec: - name: leapfrogai-pv-storage persistentVolumeClaim: claimName: lfai-{{ .Values.nameOverride }}-pv-claim + - name: leapfrogai-sdk-configmap + configMap: + name: "{{ .Values.nameOverride }}-sdk-configmap" securityContext: {{- toYaml .Values.podSecurityContext | nindent 8 }} containers: @@ -58,6 +61,9 @@ spec: env: {{- toYaml . | nindent 12 }} {{- end }} + envFrom: + - configMapRef: + name: "{{ .Values.nameOverride }}-engine-configmap" ports: - name: http containerPort: {{ .Values.service.port }} @@ -67,6 +73,10 @@ spec: volumeMounts: - name: leapfrogai-pv-storage mountPath: "/data" + - name: leapfrogai-sdk-configmap + mountPath: "/home/leapfrogai/config.yaml" + subPath: "config.yaml" + readOnly: true {{- with .Values.nodeSelector }} nodeSelector: {{- toYaml . | nindent 8 }} diff --git a/packages/vllm/chart/templates/leapfrogai-sdk-configmap.yaml b/packages/vllm/chart/templates/leapfrogai-sdk-configmap.yaml new file mode 100644 index 000000000..cdc08be5e --- /dev/null +++ b/packages/vllm/chart/templates/leapfrogai-sdk-configmap.yaml @@ -0,0 +1,37 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Values.nameOverride }}-sdk-configmap" + namespace: {{ .Release.Namespace | default "leapfrogai" }} +data: + config.yaml: | + model: + source: {{ .Values.leapfrogaiConfig.model.source | quote }} + max_context_length: {{ .Values.leapfrogaiConfig.maxContextLength | quote }} + stop_tokens: + {{- $stopTokens := .Values.leapfrogaiConfig.stopTokens }} + {{- range $stopToken := splitList ", " .Values.leapfrogaiConfig.stopTokens }} + - {{ printf "%s" $stopToken }} + {{- end }} + prompt_format: + {{- with .Values.leapfrogaiConfig.promptFormat.chat }} + chat: + {{- if .system }} + system: {{ .system | quote }} + {{- end }} + {{- if .assistant }} + assistant: {{ .assistant | quote }} + {{- end }} + {{- if .user }} + user: {{ .user | quote }} + {{- end }} + {{- if .function }} + function: {{ .function | quote }} + {{- end }} + {{- end }} + defaults: + temperature: {{ .Values.leapfrogaiConfig.defaults.temperature | quote }} + top_p: {{ .Values.leapfrogaiConfig.defaults.topP | quote }} + top_k: {{ .Values.leapfrogaiConfig.defaults.topK | quote }} + repetition_penalty: {{ .Values.leapfrogaiConfig.defaults.repetitionPenalty | quote }} + max_new_tokens: {{ .Values.leapfrogaiConfig.defaults.maxNewTokens | quote }} diff --git a/packages/vllm/chart/templates/vllm-engine-configmap.yaml b/packages/vllm/chart/templates/vllm-engine-configmap.yaml new file mode 100644 index 000000000..5ac82b42c --- /dev/null +++ b/packages/vllm/chart/templates/vllm-engine-configmap.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ 
.Values.nameOverride }}-engine-configmap" + namespace: {{ .Release.Namespace | default "leapfrogai" }} +data: + VLLM_TRUST_REMOTE_CODE: "{{ .Values.vllmConfig.trustRemoteCode }}" + VLLM_TENSOR_PARALLEL_SIZE: "{{ .Values.vllmConfig.tensorParallelSize }}" + VLLM_ENFORCE_EAGER: "{{ .Values.vllmConfig.enforceEager }}" + VLLM_GPU_MEMORY_UTILIZATION: "{{ .Values.vllmConfig.gpuMemoryUtilization }}" + VLLM_WORKER_USE_RAY: "{{ .Values.vllmConfig.workerUseRay }}" + VLLM_ENGINE_USE_RAY: "{{ .Values.vllmConfig.engineUseRay }}" + VLLM_QUANTIZATION: "{{ .Values.vllmConfig.quantization }}" + VLLM_LOAD_FORMAT: "{{ .Values.vllmConfig.loadFormat }}" diff --git a/packages/vllm/chart/values.yaml b/packages/vllm/chart/values.yaml index 0f7fe9911..0209a8b34 100644 --- a/packages/vllm/chart/values.yaml +++ b/packages/vllm/chart/values.yaml @@ -13,6 +13,33 @@ image: nameOverride: "vllm" fullnameOverride: "" +leapfrogaiConfig: + model: + source: "/data/.model/" + maxContextLength: "32768" + stopTokens: ", <|im_end|>, <|endoftext|>" + promptFormat: + chat: + system: "SYSTEM: {}\n" + assistant: "ASSISTANT: {}\n" + user: "USER: {}\n" + defaults: + temperature: "0.1" + topP: "1.0" + topK: "0" + repetitionPenalty: "1.0" + maxNewTokens: "8192" + +vllmConfig: + trustRemoteCode: "True" + tensorParallelSize: "1" + enforceEager: "False" + gpuMemoryUtilization: "0.90" + workerUseRay: "True" + engineUseRay: "True" + quantization: "None" + loadFormat: "auto" + env: - name: LFAI_LOG_LEVEL value: "INFO" @@ -41,7 +68,7 @@ resources: limits: cpu: 0 memory: 0 - nvidia.com/gpu: 0 + nvidia.com/gpu: 1 requests: cpu: 0 memory: 0 diff --git a/packages/vllm/config.yaml b/packages/vllm/config.yaml new file mode 100644 index 000000000..22210a74b --- /dev/null +++ b/packages/vllm/config.yaml @@ -0,0 +1,17 @@ +model: + source: ".model/" +max_context_length: 32768 +stop_tokens: + - "<|im_end|>" + - "<|endoftext|>" + - "" +prompt_format: + chat: + system: "SYSTEM: {}\n" + assistant: "ASSISTANT: {}\n" + user: "USER: {}\n" +defaults: + top_p: 1.0 + top_k: 0 + repetition_penalty: 1.0 + max_new_tokens: 8192 diff --git a/packages/vllm/pyproject.toml b/packages/vllm/pyproject.toml index 4d7955708..24b1363e6 100644 --- a/packages/vllm/pyproject.toml +++ b/packages/vllm/pyproject.toml @@ -8,7 +8,7 @@ version = "0.13.1" dependencies = [ "pydantic == 2.8.2", - "vllm == 0.4.2", + "vllm == 0.4.3", "python-dotenv == 1.0.1", "aiostream ==0.6.2", "leapfrogai-sdk", diff --git a/packages/vllm/src/config.py b/packages/vllm/src/config.py index debca4ba3..c13af5521 100644 --- a/packages/vllm/src/config.py +++ b/packages/vllm/src/config.py @@ -5,10 +5,6 @@ class ConfigOptions(BaseConfig): - quantization: Literal[None, "awq", "gptq", "squeezellm"] = Field( - default=None, - description="Type of quantization, for un-quantized models omit this field", - ) tensor_parallel_size: int = Field( default=1, title="GPU Utilization Count", @@ -16,39 +12,105 @@ class ConfigOptions(BaseConfig): "This must be divisible to the number of attention heads in the model", examples=[1, 2, 3], ) + quantization: Literal[ + "aqlm", + "bitsandbytes", + "awq", + "deepspeedfp", + "fp8", + "marlin", + "gptq_marlin_24", + "gptq_marlin", + "gptq", + "squeezellm", + "sparseml", + "None", + "", + ] = Field( + title="quantization", + description="Quantization type of the model" + "Force GPTQ instead of GPTQ_Marlin by explicitly providing `gptq` as value.", + examples=["awq", "fp8", "gptq_marlin", "gptq", "squeezellm", "None"], + ) + load_format: Literal["auto", "safetensors", "npz", "pt", 
"bitsandbytes"] = Field( + title="quantization", + description="Load format for the type model and files", + examples=["auto", "safetensors", "npz", "pt", "bitsandbytes"], + ) + enforce_eager: bool = Field( + title="Enable Eager Mode", + description="Enable eager mode to start token generation immediately after prompt processing." + "Potentially reduces initial latency at the cost of slightly higher memory usage." + "Should be set to False in production environments with higher GPU memory.", + examples=[True, False], + ) + gpu_memory_utilization: float = Field( + title="GPU Memory Limit", + description="Maximum amount of GPU vRAM allocated to the vLLM engine and worker(s)", + examples=[0.50, 0.80, 0.90], + ) + engine_use_ray: bool = Field( + title="Use Ray for Engine", + description="If True, uses Ray for managing the execution engine. Allows for distributed inferencing in multi-node situations.", + examples=[True, False], + ) + worker_use_ray: bool = Field( + title="Use Ray for Worker", + description="If True, uses Ray for distributed worker management. Allows for distributed inferencing in multi-node situations.", + examples=[True, False], + ) + trust_remote_code: bool = Field( + title="Trust Downloaded Model Code", + description="Whether to trust inferencing code downloaded as part of the model download." + "Please review the Python code in the .model/ directory before trusting custom model code.", + examples=[True, False], + ) class DownloadOptions(BaseConfig): - hf_hub_enable_hf_transfer: Literal["0", "1"] = Field( - description="Option (0 - Disable, 1 - Enable) for faster transfers, tradeoff stability for faster speeds" - ) repo_id: str = Field( - description="HuggingFace repo id", + description="The HuggingFace git repository ID", examples=[ - "TheBloke/Synthia-7B-v2.0-GPTQ", - "migtissera/Synthia-MoE-v3-Mixtral-8x7B", - "microsoft/phi-2", + "defenseunicorns/Hermes-2-Pro-Mistral-7B-4bit-32g", + "justinthelaw/Phi-3-mini-128k-instruct-4bit-128g", ], ) revision: str = Field( - description="The model branch to use", + description="The HuggingFace repository git branch to use", examples=["main", "gptq-4bit-64g-actorder_True"], ) +# vLLM specific runtime configuration options class AppConfig(BaseConfig): backend_options: ConfigOptions + CONFIG_SOURCES = [ + EnvSource( + allow_all=True, + prefix="VLLM_", + remap={ + "tensor_parallel_size": "backend_options.tensor_parallel_size", + "trust_remote_code": "backend_options.trust_remote_code", + "enforce_eager": "backend_options.enforce_eager", + "quantization": "backend_options.quantization", + "gpu_memory_utilization": "backend_options.gpu_memory_utilization", + "worker_use_ray": "backend_options.worker_use_ray", + "engine_use_ray": "backend_options.engine_use_ray", + "load_format": "backend_options.load_format", + }, + ) + ] + + +class DownloadConfig(BaseConfig): download_options: Optional[DownloadOptions] CONFIG_SOURCES = [ EnvSource( allow_all=True, - prefix="LAI_", + prefix="LFAI_", remap={ - "hf_hub_enable_hf_transfer": "download_options.hf_hub_enable_hf_transfer", "repo_id": "download_options.repo_id", "revision": "download_options.revision", - "quantization": "backend_options.quantization", - "tensor_parallel_size": "backend_options.tensor_parallel_size", }, ) ] diff --git a/packages/vllm/src/main.py b/packages/vllm/src/main.py index 6a530e4f0..67d36d178 100644 --- a/packages/vllm/src/main.py +++ b/packages/vllm/src/main.py @@ -1,15 +1,12 @@ import asyncio -import json import logging import os import queue import random -import sys 
import threading import time from typing import Any, Dict, AsyncGenerator -from confz import EnvSource from dotenv import load_dotenv from vllm import SamplingParams from vllm.engine.arg_utils import AsyncEngineArgs @@ -18,15 +15,8 @@ from vllm.utils import random_uuid from config import AppConfig -from leapfrogai_sdk import ( - BackendConfig, - ChatCompletionRequest, - CompletionRequest, -) -from leapfrogai_sdk.llm import ( - GenerationConfig, - LLM, -) +from leapfrogai_sdk import BackendConfig +from leapfrogai_sdk.llm import GenerationConfig, LLM load_dotenv() @@ -84,60 +74,6 @@ def remove_iterator(self, async_iterable): pass # If the iterable is not found, ignore the error -def get_backend_configs(): - # Manually load env var as ConfZ does not handle complex types (list) - stop_tokens: str | None = os.getenv("LAI_STOP_TOKENS") - if stop_tokens: - processed_stop_tokens = json.loads(stop_tokens) - else: - processed_stop_tokens = [] - del os.environ["LAI_STOP_TOKENS"] - - env_source = EnvSource( - allow_all=True, - prefix="LAI_", - remap={ - "model_source": "model.source", - "max_context_length": "max_context_length", - "stop_tokens": "stop_tokens", - "prompt_format_chat_system": "prompt_format.chat.system", - "prompt_format_chat_assistant": "prompt_format.chat.assistant", - "prompt_format_chat_user": "prompt_format.chat.user", - "prompt_format_defaults_top_p": "prompt_format.defaults.top_p", - "prompt_format_defaults_top_k": "prompt_format.defaults.top_k", - }, - ) - - BackendConfig.CONFIG_SOURCES = env_source - # Initialize an immutable config from env variables without stop_tokens list - backend_configs: BackendConfig = BackendConfig() - # Updates "processed_stop_tokens" without triggering Pydantic validation errors - backend_configs.model_copy(update={"stop_tokens": processed_stop_tokens}) - - return backend_configs - - -def get_config_from_request(request: ChatCompletionRequest | CompletionRequest): - return GenerationConfig( - max_new_tokens=request.max_new_tokens, - temperature=request.temperature, - top_k=request.top_k, - top_p=request.top_p, - do_sample=request.do_sample, - n=request.n, - stop=list(request.stop), - repetition_penalty=request.repetition_penalty, - presence_penalty=request.presence_penalty, - best_of=str(request.best_of), - logit_bias=request.logit_bias, - return_full_text=request.return_full_text, - truncate=request.truncate, - typical_p=request.typical_p, - watermark=request.watermark, - seed=request.seed, - ) - - @LLM class Model: """Implements an LLM model with concurrent output generation and management.""" @@ -152,19 +88,26 @@ def __init__(self): _thread = threading.Thread(target=asyncio.run, args=(self.iterate_outputs(),)) _thread.start() - self.backend_config = get_backend_configs() - self.model = self.backend_config.model.source + quantization = ( + None + if AppConfig().backend_options.quantization in ["", "None"] + else AppConfig().backend_options.quantization + ) + self.engine_args = AsyncEngineArgs( - engine_use_ray=True, - model=self.model, - trust_remote_code=False, - quantization=AppConfig().backend_options.quantization, - max_seq_len_to_capture=self.backend_config.max_context_length, - max_model_len=self.backend_config.max_context_length, - dtype="auto", - worker_use_ray=True, - gpu_memory_utilization=0.90, + # Taken from the LFAI SDK general LLM configuration + model=BackendConfig().model.source, + max_seq_len_to_capture=BackendConfig().max_context_length, + max_model_len=BackendConfig().max_context_length, + # Taken from the vLLM-specific 
configuration + enforce_eager=AppConfig().backend_options.enforce_eager, + quantization=quantization, + load_format=AppConfig().backend_options.load_format, tensor_parallel_size=AppConfig().backend_options.tensor_parallel_size, + engine_use_ray=AppConfig().backend_options.engine_use_ray, + worker_use_ray=AppConfig().backend_options.worker_use_ray, + gpu_memory_utilization=AppConfig().backend_options.gpu_memory_utilization, + trust_remote_code=AppConfig().backend_options.trust_remote_code, ) self.engine = AsyncLLMEngine.from_engine_args(self.engine_args) print(self.engine_args) @@ -228,18 +171,39 @@ async def create_response( """Initiate a response generation for the given prompt and configuration, adding the result to the iterator pool.""" - sampling_params = SamplingParams( - temperature=config.temperature, - # Clamp top_p value to prevent float errors - top_p=clamp(config.top_p, 0.0 + sys.float_info.epsilon, 1.0), - # Restrict top_k to valid values, -1 disables top_k - top_k=config.top_k if config.top_k >= 1 else -1, - stop=self.backend_config.stop_tokens, - max_tokens=config.max_new_tokens, - skip_special_tokens=False, - ) + # Collect LeapfrogAI SDK-defined parameters not aligned with vLLM SamplingParams + params = { + "max_tokens": getattr(config, "max_new_tokens"), + } + + # Collect LeapfrogAI SDK-defined parameters directly aligned with vLLM SamplingParams + aligned_params = [ + "temperature", + "top_p", + "top_k", + "stop", + "n", + "repetition_penalty", + "presence_penalty", + "best_of", + "logit_bias", + "return_full_text", + "truncate", + "typical_p", + "seed", + ] + + # Add only the parameters that exist in the request + # vLLM will provide defaults for the rest, if not specified + for param in aligned_params: + if param in config: + params[param] = config[param] + + # Pass the collected params to vLLM SamplingParams + sampling_params = SamplingParams(**params) + logger.info(f"Begin generation for request {request_id}") - logger.debug(f"{request_id} sampling_paramms: {sampling_params}") + logger.debug(f"{request_id} sampling_params: {sampling_params}") # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. 
@@ -284,8 +248,12 @@ async def generate( request_id ): result = "" - if not self.is_queue_empty(request_id): - result = self.delta_queue_by_id.get(request_id).get() + + # Ensure that the queue is not None and contains items before calling .get() + cur_queue = self.delta_queue_by_id.get(request_id) + if cur_queue is not None and not cur_queue.empty(): + result = cur_queue.get() + yield result logger.info(f"Finished request {request_id}") diff --git a/packages/vllm/src/model_download.py b/packages/vllm/src/model_download.py index 29f88942c..b87b6a61e 100644 --- a/packages/vllm/src/model_download.py +++ b/packages/vllm/src/model_download.py @@ -1,18 +1,17 @@ import os from huggingface_hub import snapshot_download -from config import AppConfig +from config import DownloadConfig -REPO_ID = AppConfig().download_options.repo_id -REVISION = AppConfig().download_options.revision -os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = ( - AppConfig().download_options.hf_hub_enable_hf_transfer -) +REPO_ID = DownloadConfig().download_options.repo_id +REVISION = DownloadConfig().download_options.revision + +# enable hf_transfer to max-out model download bandwidth +os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" print(f"Downloading model from {REPO_ID} at revision {REVISION}...") snapshot_download( repo_id=REPO_ID, local_dir=".model", - local_dir_use_symlinks=False, revision=REVISION, ) diff --git a/packages/vllm/values/upstream-values.yaml b/packages/vllm/values/upstream-values.yaml index 0fe581bdd..e74ebec4a 100644 --- a/packages/vllm/values/upstream-values.yaml +++ b/packages/vllm/values/upstream-values.yaml @@ -2,12 +2,55 @@ image: repository: "ghcr.io/defenseunicorns/leapfrogai/vllm" tag: "###ZARF_CONST_IMAGE_VERSION###" +nameOverride: "###ZARF_CONST_NAME_OVERRIDE###" + +leapfrogaiConfig: + model: + source: "###ZARF_CONST_MODEL_PATH###" + maxContextLength: "###ZARF_VAR_MAX_CONTEXT_LENGTH###" + stopTokens: "###ZARF_VAR_STOP_TOKENS###" + promptFormat: + chat: + system: "###ZARF_VAR_PROMPT_FORMAT_CHAT_SYSTEM###" + assistant: "###ZARF_VAR_PROMPT_FORMAT_CHAT_ASSISTANT###" + user: "###ZARF_VAR_PROMPT_FORMAT_CHAT_USER###" + defaults: + temperature: "###ZARF_VAR_TEMPERATURE###" + topP: "###ZARF_VAR_TOP_P###" + topK: "###ZARF_VAR_TOP_K###" + repetitionPenalty: "###ZARF_VAR_REPETITION_PENALTY###" + maxNewTokens: "###ZARF_VAR_MAX_NEW_TOKENS###" + + +vllmConfig: + trustRemoteCode: "###ZARF_VAR_TRUST_REMOTE_CODE###" + tensorParallelSize: "###ZARF_VAR_TENSOR_PARALLEL_SIZE###" + enforceEager: "###ZARF_VAR_ENFORCE_EAGER###" + gpuMemoryUtilization: "###ZARF_VAR_GPU_MEMORY_UTILIZATION###" + workerUseRay: "###ZARF_VAR_WORKER_USE_RAY###" + engineUseRay: "###ZARF_VAR_ENGINE_USE_RAY###" + quantization: "###ZARF_VAR_QUANTIZATION###" + loadFormat: "###ZARF_VAR_LOAD_FORMAT###" + +env: + - name: LFAI_LOG_LEVEL + value: "INFO" + gpu: runtimeClassName: "###ZARF_VAR_GPU_RUNTIME###" resources: + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 
limits: + cpu: 0 + memory: 0 nvidia.com/gpu: "###ZARF_VAR_GPU_LIMIT###" + requests: + cpu: 0 + memory: 0 persistence: size: "###ZARF_VAR_PVC_SIZE###" diff --git a/packages/vllm/zarf-config.yaml b/packages/vllm/zarf-config.yaml new file mode 100644 index 000000000..5f032eecb --- /dev/null +++ b/packages/vllm/zarf-config.yaml @@ -0,0 +1,39 @@ +package: + create: + set: + # x-release-please-start-version + image_version: "0.13.0" + # x-release-please-end + + model_repo_id: "TheBloke/Synthia-7B-v2.0-GPTQ" + model_revision: "gptq-4bit-32g-actorder_True" + model_path: "/data/.model/" + name_override: "vllm" + deploy: + set: + # vLLM runtime configuration (usually influenced by .env in local development) + trust_remote_code: "True" + tensor_parallel_size: "1" + enforce_eager: "False" + gpu_memory_utilization: "0.90" + worker_use_ray: "True" + engine_use_ray: "True" + quantization: "None" + load_format: "auto" + # LeapfrogAI SDK runtime configuration (usually influenced by config.yaml in development) + max_context_length: "32768" + stop_tokens: ", <|im_end|>, <|endoftext|>" + prompt_format_chat_system: "SYSTEM: {}\n" + prompt_format_chat_user: "USER: {}\n" + prompt_format_chat_assistant: "ASSISTANT: {}\n" + temperature: "0.1" + top_p: "1.0" + top_k: "0" + repetition_penalty: "1.0" + max_new_tokens: "8192" + # Pod deployment configuration + gpu_limit: "1" + gpu_runtime: "nvidia" + pvc_size: "15Gi" + pvc_access_mode: "ReadWriteOnce" + pvc_storage_class: "local-path" diff --git a/packages/vllm/zarf.yaml b/packages/vllm/zarf.yaml index ed88c2f18..5e1733d17 100644 --- a/packages/vllm/zarf.yaml +++ b/packages/vllm/zarf.yaml @@ -9,27 +9,86 @@ metadata: constants: - name: IMAGE_VERSION value: "###ZARF_PKG_TMPL_IMAGE_VERSION###" + - name: MODEL_REPO_ID + description: "The HuggingFace repository ID" + value: "###ZARF_PKG_TMPL_MODEL_REPO_ID###" + - name: MODEL_REVISION + description: "The HuggingFace git branch or commit hash" + value: "###ZARF_PKG_TMPL_MODEL_REVISION###" + - name: MODEL_PATH + description: "Defines the location of the Zarf Injected model files in the vLLM container" + value: "###ZARF_PKG_TMPL_MODEL_PATH###" + - name: NAME_OVERRIDE + description: "Provide an override for the name of the deployment (e.g., the model name)" + value: "###ZARF_PKG_TMPL_NAME_OVERRIDE###" variables: + # vLLM runtime configuration (usually influenced by .env in local development) + - name: TRUST_REMOTE_CODE + description: "If True, allows the execution of code within the model files directory" + pattern: "^(True|False)$" + - name: TENSOR_PARALLEL_SIZE + description: "The number of tensor parallelism splits, typically used for model parallelism across GPUs" + pattern: "^[1-9][0-9]*$" + - name: ENFORCE_EAGER + description: "If set to True, enforces eager execution mode instead of lazy execution, impacting performance" + pattern: "^(True|False)$" + - name: GPU_MEMORY_UTILIZATION + description: "The fraction of GPU memory to be utilized, expressed as a decimal value between 0.01 and 0.99" + pattern: ^0\.(0[1-9]|[1-9][0-9])$ + - name: WORKER_USE_RAY + description: "If True, uses Ray for distributed worker management" + pattern: "^(True|False)$" + - name: ENGINE_USE_RAY + description: "If True, uses Ray for managing the execution engine" + pattern: "^(True|False)$" + - name: QUANTIZATION + description: "If None, allows vLLM to automatically detect via model files and configuration" + - name: LOAD_FORMAT + description: "If auto, allows vLLM to automatically detect via model files and configuration" + # LeapfrogAI SDK 
runtime configuration (usually influenced by config.yaml in development) + - name: MAX_CONTEXT_LENGTH + description: "The maximum number of tokens the model can process in a single input before the inferencing engine's overflow strategy is used" + pattern: "^[1-9][0-9]*$" + - name: STOP_TOKENS + description: "A set of special tokens that signal the model to stop producing further output, delimited using a comma and space" + pattern: ^(<[^,]+>\s*,\s*)*<[^,]+>\s*$ + - name: PROMPT_FORMAT_CHAT_SYSTEM + description: "Prompt template format for the LeapfrogAI SDK to consume and wrap" + - name: PROMPT_FORMAT_CHAT_USER + description: "Prompt template format for the LeapfrogAI SDK to consume and wrap" + - name: PROMPT_FORMAT_CHAT_ASSISTANT + description: "Prompt template format for the LeapfrogAI SDK to consume and wrap" + - name: TEMPERATURE + description: "Controls the randomness of the model's output" + pattern: ^(0(\.\d+)?|1(\.0+)?)$ + - name: TOP_P + description: "The cumulative probability threshold for token sampling, where 1.0 represents no restriction" + pattern: ^(0(\.\d+)?|1(\.0+)?)$ + - name: TOP_K + description: "The number of top-K tokens to consider during sampling, where 0 disables top-K sampling" + pattern: ^\d+$ + - name: REPETITION_PENALTY + description: "The penalty value for repetition in generation" + pattern: ^(0(\.\d+)?|1(\.0+)?)$ + - name: MAX_NEW_TOKENS + description: "Maximum new tokens to generate" + pattern: ^\d+$ + # Pod deployment configuration - name: GPU_LIMIT - description: The GPU limit for the model inferencing. Must be 1 or more. - default: "1" + description: "The GPU limit for the model inferencing. Must be 1 or more." pattern: "^[1-9][0-9]*$" - name: GPU_RUNTIME - description: The GPU runtime name for the model inferencing. - default: "nvidia" + description: "The GPU runtime name for the model inferencing." pattern: "^(nvidia)?$" - name: PVC_SIZE - description: Size of the PVC used for model storage. - default: "15Gi" + description: "Size of the PVC used for model storage." pattern: "^[0-9]+[a-zA-Z]+$" - name: PVC_ACCESS_MODE - description: Access mode of the PVC used for model storage. - default: "ReadWriteOnce" + description: "Access mode of the PVC used for model storage." pattern: "^(ReadWriteOnce|ReadOnlyMany|ReadWriteMany)$" - name: PVC_STORAGE_CLASS - description: Storage class of the PVC used for model storage. - default: "local-path" + description: "Storage class of the PVC used for model storage." 
components: - name: vllm-model @@ -37,33 +96,33 @@ components: only: flavor: upstream charts: - - name: vllm-model + - name: "###ZARF_PKG_TMPL_NAME_OVERRIDE###-model" namespace: leapfrogai localPath: chart - releaseName: vllm-model + releaseName: "###ZARF_PKG_TMPL_NAME_OVERRIDE###-model" # x-release-please-start-version version: 0.13.1 # x-release-please-end valuesFiles: - "values/upstream-values.yaml" images: - - ghcr.io/defenseunicorns/leapfrogai/vllm:###ZARF_PKG_TMPL_IMAGE_VERSION### - - cgr.dev/chainguard/bash:latest + - "ghcr.io/defenseunicorns/leapfrogai/vllm:###ZARF_PKG_TMPL_IMAGE_VERSION###" + - "cgr.dev/chainguard/bash:latest" dataInjections: - - source: .model/ + # location where locally downloaded model files are located + - source: ".model/" target: - namespace: leapfrogai - selector: app=lfai-vllm - container: data-loader - path: /data/.model + namespace: "leapfrogai" + selector: "app=lfai-###ZARF_PKG_TMPL_NAME_OVERRIDE###" + container: "data-loader" + # location in the container for injection of the model files + path: "###ZARF_PKG_TMPL_MODEL_PATH###" compress: true actions: onCreate: before: # NOTE: This assumes python is installed and in $PATH and 'huggingface_hub[cli,hf_transfer]' has been installed - - cmd: python src/model_download.py + - cmd: "python src/model_download.py" env: - - LAI_REPO_ID=TheBloke/Synthia-7B-v2.0-GPTQ - - LAI_REVISION=gptq-4bit-32g-actorder_True - - LAI_QUANTIZATION=gptq - - LAI_HF_HUB_ENABLE_HF_TRANSFER=1 + - LFAI_REPO_ID=###ZARF_PKG_TMPL_MODEL_REPO_ID### + - LFAI_REVISION=###ZARF_PKG_TMPL_MODEL_REVISION### diff --git a/packages/whisper/Dockerfile b/packages/whisper/Dockerfile index b3bed054a..a5513e9fa 100644 --- a/packages/whisper/Dockerfile +++ b/packages/whisper/Dockerfile @@ -37,8 +37,8 @@ COPY --from=builder /leapfrogai/.venv/ /leapfrogai/.venv/ # set the path to the cuda 11.8 dependencies ENV LD_LIBRARY_PATH \ - /leapfrogai/.venv/lib64/python3.11/site-packages/nvidia/cublas/lib:\ - /leapfrogai/.venv/lib64/python3.11/site-packages/nvidia/cudnn/lib +/leapfrogai/.venv/lib64/python3.11/site-packages/nvidia/cublas/lib:\ +/leapfrogai/.venv/lib64/python3.11/site-packages/nvidia/cudnn/lib COPY packages/whisper/main.py . diff --git a/src/leapfrogai_api/README.md b/src/leapfrogai_api/README.md index eec4dd0c6..214c986a9 100644 --- a/src/leapfrogai_api/README.md +++ b/src/leapfrogai_api/README.md @@ -56,3 +56,72 @@ See the ["Access" section of the DEVELOPMENT.md](../../docs/DEVELOPMENT.md#acces ### Tests See the [tests directory documentation](../../tests/README.md) for more details. + +### Reranking Configuration + +The LeapfrogAI API includes a Retrieval Augmented Generation (RAG) pipeline for enhanced question answering. This section details how to configure its reranking options. All RAG configurations are managed through the `/leapfrogai/v1/rag/configure` API endpoint. + +#### 1. Enabling/Disabling Reranking + +Reranking improves the accuracy and relevance of RAG responses. You can enable or disable it using the `enable_reranking` parameter: + +* **Enable Reranking:** Send a PATCH request to `/leapfrogai/v1/rag/configure` with the following JSON payload: + +```json +{ + "enable_reranking": true +} +``` + +* **Disable Reranking:** Send a PATCH request with: + +```json +{ + "enable_reranking": false +} +``` + +#### 2. Selecting a Reranking Model + +Multiple reranking models are supported, each offering different performance characteristics. Choose your preferred model using the `ranking_model` parameter. 
Ensure you've installed any necessary Python dependencies for your chosen model (see the [rerankers library documentation](https://github.com/AnswerDotAI/rerankers) on dependencies). + +* **Supported Models:** The system supports several models, including (but not limited to) `flashrank`, `rankllm`, `cross-encoder`, and `colbert`. Refer to the [rerankers library documentation](https://github.com/AnswerDotAI/rerankers) for a complete list and details on their capabilities. + +* **Model Selection:** Use a PATCH request to `/leapfrogai/v1/rag/configure` with the desired model: + +```json +{ + "enable_reranking": true, // Reranking must be enabled + "ranking_model": "rankllm" // Or another supported model +} +``` + +#### 3. Adjusting the Number of Results Before Reranking (`rag_top_k_when_reranking`) + +This parameter sets the number of top results retrieved from the vector database *before* the reranking process begins. A higher value increases the diversity of candidates considered for reranking but also increases processing time. A lower value can lead to missing relevant results if not carefully chosen. This setting is only relevant when reranking is enabled. + +* **Configuration:** Use a PATCH request to `/leapfrogai/v1/rag/configure` to set this value: + +```json +{ + "enable_reranking": true, + "ranking_model": "flashrank", + "rag_top_k_when_reranking": 150 // Adjust this value as needed +} +``` + +#### 4. Retrieving the Current RAG Configuration + +To check the current RAG configuration (including reranking status, model, and `rag_top_k_when_reranking`), send a GET request to `/leapfrogai/v1/rag/configure`. The response will be a JSON object containing all the current settings. + +#### 5. Example Configuration Flow + +1. **Initial Setup:** Start with reranking enabled using the default `flashrank` model and a `rag_top_k_when_reranking` value of 100. + +2. **Experiment with Models:** Test different reranking models (`rankllm`, `colbert`, etc.) by changing the `ranking_model` parameter and observing the impact on response quality. Adjust `rag_top_k_when_reranking` as needed to find the optimal balance between diversity and performance. + +3. **Fine-tuning:** Once you identify a suitable model, fine-tune the `rag_top_k_when_reranking` parameter for optimal performance. Monitor response times and quality to determine the best setting. + +4. **Disabling Reranking:** If needed, disable reranking by setting `"enable_reranking": false`. + +Remember to always consult the [rerankers library documentation](https://github.com/AnswerDotAI/rerankers) for information on supported models and their specific requirements. The API documentation provides further details on request formats and potential error responses. 
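+
+#### 6. Example Requests
+
+As a reference only, here is a minimal `curl` sketch of the configuration endpoints described above. It assumes the API is reachable at `https://leapfrogai-api.uds.dev` and that `$LFAI_API_KEY` holds a valid bearer token for your deployment; substitute the host and credentials for your environment.
+
+```bash
+# Update the RAG configuration at runtime (hypothetical host and token shown)
+curl -X PATCH "https://leapfrogai-api.uds.dev/leapfrogai/v1/rag/configure" \
+  -H "Authorization: Bearer $LFAI_API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{"enable_reranking": true, "ranking_model": "flashrank", "rag_top_k_when_reranking": 100}'
+
+# Retrieve the current RAG configuration
+curl -X GET "https://leapfrogai-api.uds.dev/leapfrogai/v1/rag/configure" \
+  -H "Authorization: Bearer $LFAI_API_KEY"
+```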
diff --git a/src/leapfrogai_api/backend/grpc_client.py b/src/leapfrogai_api/backend/grpc_client.py index f9082fdc2..9d18d2951 100644 --- a/src/leapfrogai_api/backend/grpc_client.py +++ b/src/leapfrogai_api/backend/grpc_client.py @@ -63,7 +63,7 @@ async def completion(model: Model, request: lfai.CompletionRequest): CompletionChoice( index=0, text=response.choices[0].text, - finish_reason=finish_reason_enum.to_string(), + finish_reason=finish_reason_enum.to_finish_reason(), logprobs=None, ) ], @@ -122,7 +122,7 @@ async def chat_completion(model: Model, request: lfai.ChatCompletionRequest): ).lower(), content=response.choices[0].chat_item.content, ), - finish_reason=finish_reason_enum.to_string(), + finish_reason=finish_reason_enum.to_finish_reason(), ) ], usage=Usage( diff --git a/src/leapfrogai_api/backend/helpers.py b/src/leapfrogai_api/backend/helpers.py index 65a2fd0b5..005111601 100644 --- a/src/leapfrogai_api/backend/helpers.py +++ b/src/leapfrogai_api/backend/helpers.py @@ -39,7 +39,7 @@ async def recv_completion( index=0, text=c.choices[0].text, logprobs=None, - finish_reason=finish_reason_enum.to_string(), + finish_reason=finish_reason_enum.to_finish_reason(), ) ], usage=Usage( @@ -77,7 +77,7 @@ async def recv_chat( delta=ChatDelta( role="assistant", content=c.choices[0].chat_item.content ), - finish_reason=finish_reason_enum.to_string(), + finish_reason=finish_reason_enum.to_finish_reason(), ) ], usage=Usage( diff --git a/src/leapfrogai_api/backend/rag/query.py b/src/leapfrogai_api/backend/rag/query.py index e5e0decce..bd0ae9bf6 100644 --- a/src/leapfrogai_api/backend/rag/query.py +++ b/src/leapfrogai_api/backend/rag/query.py @@ -1,11 +1,15 @@ """Service for querying the RAG model.""" +from rerankers.results import RankedResults from supabase import AClient as AsyncClient from langchain_core.embeddings import Embeddings from leapfrogai_api.backend.rag.leapfrogai_embeddings import LeapfrogAIEmbeddings from leapfrogai_api.data.crud_vector_content import CRUDVectorContent -from leapfrogai_api.typedef.vectorstores.search_types import SearchResponse +from leapfrogai_api.typedef.rag.rag_types import ConfigurationSingleton +from leapfrogai_api.typedef.vectorstores.search_types import SearchResponse, SearchItem from leapfrogai_api.backend.constants import TOP_K +from leapfrogai_api.utils.logging_tools import logger +from rerankers import Reranker # Allows for overwriting type of embeddings that will be instantiated embeddings_type: type[Embeddings] | type[LeapfrogAIEmbeddings] | None = ( @@ -22,7 +26,10 @@ def __init__(self, db: AsyncClient) -> None: self.embeddings = embeddings_type() async def query_rag( - self, query: str, vector_store_id: str, k: int = TOP_K + self, + query: str, + vector_store_id: str, + k: int = TOP_K, ) -> SearchResponse: """ Query the Vector Store. @@ -36,11 +43,70 @@ async def query_rag( SearchResponse: The search response from the vector store. """ + logger.debug("Beginning RAG query...") + # 1. Embed query vector = await self.embeddings.aembed_query(query) # 2. Perform similarity search + _k: int = k + if ConfigurationSingleton.get_instance().enable_reranking: + """Use the user specified top-k value unless reranking. + When reranking, use the reranking top-k value to get the initial results. 
+ Then filter the list down later to just the k that the user has requested after reranking.""" + _k = ConfigurationSingleton.get_instance().rag_top_k_when_reranking + crud_vector_content = CRUDVectorContent(db=self.db) - return await crud_vector_content.similarity_search( - query=vector, vector_store_id=vector_store_id, k=k + results = await crud_vector_content.similarity_search( + query=vector, vector_store_id=vector_store_id, k=_k ) + + # 3. Rerank results + if ( + ConfigurationSingleton.get_instance().enable_reranking + and len(results.data) > 0 + ): + ranker = Reranker(ConfigurationSingleton.get_instance().ranking_model) + ranked_results: RankedResults = ranker.rank( + query=query, + docs=[result.content for result in results.data], + doc_ids=[result.id for result in results.data], + ) + results = rerank_search_response(results, ranked_results) + # Narrow down the results to the top-k value specified by the user + results.data = results.data[0:k] + + logger.debug("Ending RAG query...") + + return results + + +def rerank_search_response( + original_response: SearchResponse, ranked_results: RankedResults +) -> SearchResponse: + """ + Reorder the SearchResponse based on reranked results. + + Args: + original_response (SearchResponse): The original search response. + ranked_results (List[str]): List of ranked content strings. + + Returns: + SearchResponse: A new SearchResponse with reordered items. + """ + # Create a mapping of id to original SearchItem + content_to_item = {item.id: item for item in original_response.data} + + # Create new SearchItems based on reranked results + ranked_items = [] + for content in ranked_results.results: + if content.document.doc_id in content_to_item: + item: SearchItem = content_to_item[content.document.doc_id] + item.rank = content.rank + item.score = content.score + ranked_items.append(item) + + ranked_response = SearchResponse(data=ranked_items) + + # Create a new SearchResponse with reranked items + return ranked_response diff --git a/src/leapfrogai_api/main.py b/src/leapfrogai_api/main.py index 85822f7f3..108ccd51e 100644 --- a/src/leapfrogai_api/main.py +++ b/src/leapfrogai_api/main.py @@ -8,12 +8,13 @@ from fastapi import FastAPI from fastapi.exception_handlers import request_validation_exception_handler from fastapi.exceptions import RequestValidationError - +from fastapi.responses import RedirectResponse from leapfrogai_api.routers.base import router as base_router from leapfrogai_api.routers.leapfrogai import auth from leapfrogai_api.routers.leapfrogai import models as lfai_models from leapfrogai_api.routers.leapfrogai import vector_stores as lfai_vector_stores from leapfrogai_api.routers.leapfrogai import count as lfai_token_count +from leapfrogai_api.routers.leapfrogai import rag as lfai_rag from leapfrogai_api.routers.openai import ( assistants, audio, @@ -29,6 +30,7 @@ vector_stores, ) from leapfrogai_api.utils import get_model_config +from prometheus_fastapi_instrumentator import Instrumentator logging.basicConfig( level=os.getenv("LFAI_LOG_LEVEL", logging.INFO), @@ -61,6 +63,21 @@ async def lifespan(app: FastAPI): app = FastAPI(lifespan=lifespan) +@app.get("/", include_in_schema=False) +async def root(): + """Intercepts the root path and redirects to the API documentation.""" + return RedirectResponse(url="/docs") + + +Instrumentator( + excluded_handlers=["/healthz", "/metrics"], + should_group_status_codes=False, +).instrument(app).expose( + app, + include_in_schema=False, +) + + @app.exception_handler(RequestValidationError) async def 
validation_exception_handler(request, exc): logger.error(f"The client sent invalid data!: {exc}") @@ -81,6 +98,8 @@ async def validation_exception_handler(request, exc): app.include_router(messages.router) app.include_router(runs_steps.router) app.include_router(lfai_vector_stores.router) +if os.environ.get("DEV"): + app.include_router(lfai_rag.router) app.include_router(lfai_token_count.router) app.include_router(lfai_models.router) # This should be at the bottom to prevent it preempting more specific runs endpoints diff --git a/src/leapfrogai_api/pyproject.toml b/src/leapfrogai_api/pyproject.toml index a18f6422f..4542f7922 100644 --- a/src/leapfrogai_api/pyproject.toml +++ b/src/leapfrogai_api/pyproject.toml @@ -26,6 +26,8 @@ dependencies = [ "postgrest==0.16.11", # required by supabase, bug when using previous versions "openpyxl == 3.1.5", "psutil == 6.0.0", + "prometheus-fastapi-instrumentator == 7.0.0", + "rerankers[flashrank] == 0.5.3" ] requires-python = "~=3.11" diff --git a/src/leapfrogai_api/routers/leapfrogai/rag.py b/src/leapfrogai_api/routers/leapfrogai/rag.py new file mode 100644 index 000000000..3b61b616e --- /dev/null +++ b/src/leapfrogai_api/routers/leapfrogai/rag.py @@ -0,0 +1,56 @@ +"""LeapfrogAI endpoints for RAG.""" + +from fastapi import APIRouter +from leapfrogai_api.typedef.rag.rag_types import ( + ConfigurationSingleton, + ConfigurationPayload, +) +from leapfrogai_api.routers.supabase_session import Session +from leapfrogai_api.utils.logging_tools import logger + +router = APIRouter(prefix="/leapfrogai/v1/rag", tags=["leapfrogai/rag"]) + + +@router.patch("/configure") +async def configure(session: Session, configuration: ConfigurationPayload) -> None: + """ + Configures the RAG settings at runtime. + + Args: + session (Session): The database session. + configuration (Configuration): The configuration to update. + """ + + # We set the class variable to update the configuration globally + ConfigurationSingleton._instance = ConfigurationSingleton.get_instance().copy( + update=configuration.dict(exclude_none=True) + ) + + +@router.get("/configure") +async def get_configuration(session: Session) -> ConfigurationPayload: + """ + Retrieves the current RAG configuration. + + Args: + session (Session): The database session. + + Returns: + Configuration: The current RAG configuration. 
+ """ + + instance = ConfigurationSingleton.get_instance() + + # Create a new dictionary with only the relevant attributes + config_dict = { + key: value + for key, value in instance.__dict__.items() + if not key.startswith("_") # Exclude private attributes + } + + # Create a new ConfigurationPayload instance with the filtered dictionary + new_configuration = ConfigurationPayload(**config_dict) + + logger.info(f"The current configuration has been set to {new_configuration}") + + return new_configuration diff --git a/src/leapfrogai_api/routers/leapfrogai/vector_stores.py b/src/leapfrogai_api/routers/leapfrogai/vector_stores.py index 09f8f4a77..5251440c1 100644 --- a/src/leapfrogai_api/routers/leapfrogai/vector_stores.py +++ b/src/leapfrogai_api/routers/leapfrogai/vector_stores.py @@ -33,9 +33,7 @@ async def search( """ query_service = QueryService(db=session) return await query_service.query_rag( - query=query, - vector_store_id=vector_store_id, - k=k, + query=query, vector_store_id=vector_store_id, k=k ) diff --git a/src/leapfrogai_api/typedef/completion/completion_types.py b/src/leapfrogai_api/typedef/completion/completion_types.py index 9a5cdad95..f92d91f28 100644 --- a/src/leapfrogai_api/typedef/completion/completion_types.py +++ b/src/leapfrogai_api/typedef/completion/completion_types.py @@ -7,15 +7,48 @@ class FinishReason(Enum): - NONE = 0 # Maps to "None" - STOP = 1 # Maps to "stop" - LENGTH = 2 # Maps to "length" + NONE = 0 + STOP = 1 + LENGTH = 2 - def to_string(self) -> str | None: + def to_finish_reason(self) -> str | None: + """ + Convert the enum member to its corresponding finish reason string. + + Returns: + str | None: The finish reason as a lowercase string if it is not NONE; otherwise, None. + """ if self == FinishReason.NONE: return None return self.name.lower() + @classmethod + def _missing_(cls, value): + """ + Handle missing values when creating an enum instance. + + This method is called when a value passed to the enum constructor does not match any existing enum members. + It provides custom logic to map input values to enum members or raises an error if the value is invalid. + + Args: + value: The value that was not found among the enum members. + + Returns: + FinishReason: The corresponding enum member after applying custom mapping. + + Raises: + ValueError: If the value cannot be mapped to any enum member. 
+ """ + # Handle custom value mappings + if value is None or value == "None": + return cls.NONE + elif value == "stop": + return cls.STOP + elif value == "length": + return cls.LENGTH + else: + raise ValueError(f"Invalid FinishReason value: {value}") + class CompletionChoice(BaseModel): """Choice object for completion.""" diff --git a/src/leapfrogai_api/typedef/rag/__init__.py b/src/leapfrogai_api/typedef/rag/__init__.py new file mode 100644 index 000000000..65c2e26cd --- /dev/null +++ b/src/leapfrogai_api/typedef/rag/__init__.py @@ -0,0 +1,3 @@ +from .rag_types import ( + ConfigurationSingleton as ConfigurationSingleton, +) diff --git a/src/leapfrogai_api/typedef/rag/rag_types.py b/src/leapfrogai_api/typedef/rag/rag_types.py new file mode 100644 index 000000000..17fe6601c --- /dev/null +++ b/src/leapfrogai_api/typedef/rag/rag_types.py @@ -0,0 +1,40 @@ +from typing import Optional + +from pydantic import BaseModel, Field + + +class ConfigurationSingleton: + """Singleton manager for ConfigurationPayload.""" + + _instance = None + + @classmethod + def get_instance(cls): + if cls._instance is None: + cls._instance = ConfigurationPayload() + cls._instance.enable_reranking = True + cls._instance.rag_top_k_when_reranking = 100 + cls._instance.ranking_model = "flashrank" + return cls._instance + + +class ConfigurationPayload(BaseModel): + """Response for RAG configuration.""" + + enable_reranking: Optional[bool] = Field( + default=None, + examples=[True, False], + description="Enables reranking for RAG queries", + ) + # More model info can be found here: + # https://github.com/AnswerDotAI/rerankers?tab=readme-ov-file + # https://pypi.org/project/rerankers/ + ranking_model: Optional[str] = Field( + default=None, + description="What model to use for reranking. Some options may require additional python dependencies.", + examples=["flashrank", "rankllm", "cross-encoder", "colbert"], + ) + rag_top_k_when_reranking: Optional[int] = Field( + default=None, + description="The top-k results returned from the RAG call before reranking", + ) diff --git a/src/leapfrogai_api/typedef/vectorstores/search_types.py b/src/leapfrogai_api/typedef/vectorstores/search_types.py index d8d2a2d13..ea69df1fe 100644 --- a/src/leapfrogai_api/typedef/vectorstores/search_types.py +++ b/src/leapfrogai_api/typedef/vectorstores/search_types.py @@ -1,3 +1,5 @@ +from typing import Optional + from pydantic import BaseModel, Field @@ -25,6 +27,14 @@ class SearchItem(BaseModel): similarity: float = Field( ..., description="Similarity score of this item to the query." 
) + rank: Optional[int] = Field( + default=None, + description="The rank of this search item after ranking has occurred.", + ) + score: Optional[float] = Field( + default=None, + description="The score of this search item after ranking has occurred.", + ) class SearchResponse(BaseModel): diff --git a/src/leapfrogai_api/utils/logging_tools.py b/src/leapfrogai_api/utils/logging_tools.py new file mode 100644 index 000000000..aa2448288 --- /dev/null +++ b/src/leapfrogai_api/utils/logging_tools.py @@ -0,0 +1,12 @@ +import os +import logging +from dotenv import load_dotenv + +load_dotenv() + +logging.basicConfig( + level=os.getenv("LFAI_LOG_LEVEL", logging.INFO), + format="%(name)s: %(asctime)s | %(levelname)s | %(filename)s:%(lineno)s >>> %(message)s", +) + +logger = logging.getLogger(__name__) diff --git a/src/leapfrogai_evals/pyproject.toml b/src/leapfrogai_evals/pyproject.toml index 1974da81a..9726c51c0 100644 --- a/src/leapfrogai_evals/pyproject.toml +++ b/src/leapfrogai_evals/pyproject.toml @@ -8,7 +8,7 @@ version = "0.13.1" dependencies = [ "deepeval == 1.3.0", - "openai == 1.42.0", + "openai == 1.45.0", "tqdm == 4.66.5", "python-dotenv == 1.0.1", "seaborn == 0.13.2", @@ -16,7 +16,8 @@ dependencies = [ "huggingface-hub == 0.24.6", "anthropic ==0.34.2", "instructor ==1.4.3", - "pyPDF2 == 3.0.1" + "pyPDF2 == 3.0.1", + "python-dotenv == 1.0.1" ] requires-python = "~=3.11" readme = "README.md" diff --git a/tasks.yaml b/tasks.yaml new file mode 100644 index 000000000..2298757ba --- /dev/null +++ b/tasks.yaml @@ -0,0 +1,133 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/defenseunicorns/uds-cli/v0.14.0/tasks.schema.json + +includes: + - badge: https://raw.githubusercontent.com/defenseunicorns/uds-common/82e63be82766a2e550a847af904b2d738c9d3478/tasks/badge.yaml + +tasks: + - name: nightly-uds-badge-verification + description: "Runs in a pipeline and produces a report for archiving" + actions: + - description: "Create Reports Directory" + cmd: | + mkdir -p reports + - description: "Run UDS Badge Verification Task" + cmd: | + uds run verify-uds-badge-cpu --no-progress 2>&1 | tee ./reports/intermediate-report.txt + - description: "Clean Up Final Report" + cmd: | + python3 .github/scripts/uds_verification_report.py | tee ./reports/final-report.txt + + ############# + # BADGE TASKS + ############# + - name: verify-uds-badge-cpu + description: "Runs through all CPU UDS bundle packages with the UDS badge verification test" + actions: + - task: verify-uds-badge-api + - task: verify-uds-badge-ui + - task: verify-uds-badge-llama-cpp-python + - task: verify-uds-badge-text-embeddings + - task: verify-uds-badge-whisper + - task: verify-uds-badge-supabase + + - name: verify-uds-badge-gpu + description: "Runs through all GPU UDS bundle packages with the UDS badge verification test" + actions: + - task: verify-uds-badge-api + - task: verify-uds-badge-ui + - task: verify-uds-badge-vllm + - task: verify-uds-badge-text-embeddings + - task: verify-uds-badge-whisper + - task: verify-uds-badge-supabase + + ####################### + # RE-USABLE BADGE TASKS + ####################### + + - name: verify-uds-badge-api + actions: + - description: "Verify API" + cmd: | + uds run badge:verify-badge \ + --set CHART_PATH="chart" \ + --set GROUP_NAME="package" \ + --set COMMON_ZARF="true" \ + --set PACKAGE_DIR="packages/api" \ + --no-progress + + - name: verify-uds-badge-ui + actions: + - description: "Verify UI" + cmd: | + uds run badge:verify-badge \ + --set CHART_PATH="chart" \ + --set 
GROUP_NAME="package" \ + --set COMMON_ZARF="false" \ + --set PACKAGE_DIR="packages/ui" \ + --no-progress + + - name: verify-uds-badge-llama-cpp-python + actions: + - description: "Verify LLaMA-CPP-Python" + cmd: | + uds run badge:verify-badge \ + --set CHART_PATH="chart" \ + --set GROUP_NAME="package" \ + --set COMMON_ZARF="false" \ + --set PACKAGE_DIR="packages/llama-cpp-python" \ + --no-progress + + - name: verify-uds-badge-vllm + actions: + - description: "Verify vLLM" + cmd: | + uds run badge:verify-badge \ + --set CHART_PATH="chart" \ + --set GROUP_NAME="package" \ + --set COMMON_ZARF="false" \ + --set PACKAGE_DIR="packages/vllm" \ + --no-progress + + - name: verify-uds-badge-text-embeddings + actions: + - description: "Verify Text-Embeddings" + cmd: | + uds run badge:verify-badge \ + --set CHART_PATH="chart" \ + --set GROUP_NAME="package" \ + --set COMMON_ZARF="false" \ + --set PACKAGE_DIR="packages/text-embeddings" \ + --no-progress + + - name: verify-uds-badge-whisper + actions: + - description: "Verify Whisper" + cmd: | + uds run badge:verify-badge \ + --set CHART_PATH="chart" \ + --set GROUP_NAME="package" \ + --set COMMON_ZARF="false" \ + --set PACKAGE_DIR="packages/whisper" \ + --no-progress + + - name: verify-uds-badge-repeater + actions: + - description: "Verify Repeater" + cmd: | + uds run badge:verify-badge \ + --set CHART_PATH="chart" \ + --set GROUP_NAME="package" \ + --set COMMON_ZARF="false" \ + --set PACKAGE_DIR="packages/repeater" \ + --no-progress + + - name: verify-uds-badge-supabase + actions: + - description: "Verify Supabase" + cmd: | + uds run badge:verify-badge \ + --set CHART_PATH="chart" \ + --set GROUP_NAME="package" \ + --set COMMON_ZARF="false" \ + --set PACKAGE_DIR="packages/supabase" \ + --no-progress diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index 4f498b102..580034011 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -1,12 +1,14 @@ +from openai import OpenAI import pytest -from openai import OpenAI +from tests.utils.client import leapfrogai_client, get_leapfrogai_model -from .utils import create_test_user + +@pytest.fixture(scope="module") +def client() -> OpenAI: + return leapfrogai_client() @pytest.fixture(scope="module") -def client(): - return OpenAI( - base_url="https://leapfrogai-api.uds.dev/openai/v1", api_key=create_test_user() - ) +def model_name() -> str: + return get_leapfrogai_model() diff --git a/tests/e2e/test_api.py b/tests/e2e/test_api.py index b556954e0..44e533645 100644 --- a/tests/e2e/test_api.py +++ b/tests/e2e/test_api.py @@ -5,7 +5,7 @@ import pytest as pytest import requests -from .utils import create_test_user +from tests.utils.client import create_test_user logger = logging.getLogger(__name__) test_id = str(uuid.uuid4()) diff --git a/tests/e2e/test_llm_generation.py b/tests/e2e/test_llm_generation.py index badb0dd3e..cb309d597 100644 --- a/tests/e2e/test_llm_generation.py +++ b/tests/e2e/test_llm_generation.py @@ -1,41 +1,28 @@ -import os from typing import Iterable -import warnings import pytest from openai import InternalServerError, OpenAI from openai.types.chat import ChatCompletionMessageParam from tests.utils.data_path import data_path, WAV_FILE -DEFAULT_LEAPFROGAI_MODEL = "llama-cpp-python" - - -def get_model_name(): - model_name = os.getenv("LEAPFROGAI_MODEL") - if model_name is None: - warnings.warn( - f"LEAPFROGAI_MODEL environment variable not set. Defaulting to '{DEFAULT_LEAPFROGAI_MODEL}'.\n" - "Consider setting LEAPFROGAI_MODEL explicitly. 
-            "Consider setting LEAPFROGAI_MODEL explicitly. Examples: 'vllm', 'repeater', 'llama-cpp-python'."
-        )
-        model_name = DEFAULT_LEAPFROGAI_MODEL
-    return model_name
-
-
-@pytest.fixture
-def model_name():
-    return get_model_name()
+# Test generation parameters
+SYSTEM_PROMPT = "You are a helpful assistant."
+USER_PROMPT = "Only return 1 word"
+MAX_TOKENS = 128
+TEMPERATURE = 0


 def test_chat_completions(client: OpenAI, model_name: str):
     messages: Iterable[ChatCompletionMessageParam] = [
-        {"role": "system", "content": "You are a helpful assistant."},
-        {"role": "user", "content": "What is your name?"},
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": USER_PROMPT},
     ]
     chat_completion = client.chat.completions.create(
         model=model_name,
         messages=messages,
-        max_tokens=128,
+        max_tokens=MAX_TOKENS,
+        temperature=TEMPERATURE,
     )
     assert chat_completion.model == model_name
     assert len(chat_completion.choices) == 1
@@ -51,8 +38,9 @@ def test_chat_completions(client: OpenAI, model_name: str):
 def test_completions(client: OpenAI, model_name: str):
     completion = client.completions.create(
         model=model_name,
-        prompt="Only return 1 word",
-        max_tokens=128,
+        prompt=USER_PROMPT,
+        max_tokens=MAX_TOKENS,
+        temperature=TEMPERATURE,
     )
     assert completion.model == model_name
     assert len(completion.choices) == 1
diff --git a/tests/e2e/test_supabase.py b/tests/e2e/test_supabase.py
index 1e98f2ec4..c9302c6be 100644
--- a/tests/e2e/test_supabase.py
+++ b/tests/e2e/test_supabase.py
@@ -17,7 +17,7 @@

 from leapfrogai_api.data.crud_vector_store_file import CRUDVectorStoreFile

-from .utils import ANON_KEY, create_test_user, SERVICE_KEY
+from tests.utils.client import ANON_KEY, create_test_user, SERVICE_KEY
 from openai.types import FileObject

 health_urls = {
diff --git a/tests/e2e/test_text_backend_full.py b/tests/e2e/test_text_backend_full.py
index fdee17172..d1f28bcf4 100644
--- a/tests/e2e/test_text_backend_full.py
+++ b/tests/e2e/test_text_backend_full.py
@@ -21,7 +21,7 @@ def download_arxiv_pdf():
     )


-def test_run_with_background_task(client: OpenAI):
+def test_run_with_background_task(client: OpenAI, model_name: str):
     """
     This test confirms whether a vector store for an assistant can index files
     while chatting at the same time.
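Below is a minimal, illustrative sketch (not part of this changeset) of how the shared `client`/`model_name` fixtures and the module-level constants introduced above could be reused for an additional end-to-end check. The streaming behaviour of the backend and the test name `test_chat_completions_streaming` are assumptions, not code from this PR:

# Hypothetical streaming variant of the chat completions e2e test.
# Assumes it lives in tests/e2e/test_llm_generation.py, so the SYSTEM_PROMPT,
# USER_PROMPT, MAX_TOKENS, and TEMPERATURE constants defined above are in scope.
from typing import Iterable

from openai import OpenAI
from openai.types.chat import ChatCompletionMessageParam


def test_chat_completions_streaming(client: OpenAI, model_name: str):
    messages: Iterable[ChatCompletionMessageParam] = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT},
    ]
    # stream=True returns an iterator of ChatCompletionChunk objects
    stream = client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
        stream=True,
    )
    # Join the streamed deltas; chunks with no content (e.g. the final one) are skipped
    content = "".join(chunk.choices[0].delta.content or "" for chunk in stream)
    assert len(content) > 0

Pinning `temperature=0`, as the diff does for the existing tests, keeps the generated text deterministic enough for these assertions to be stable across runs.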
@@ -52,7 +52,7 @@ def test_run_with_background_task(client: OpenAI):
     # Create an assistant
     assistant = client.beta.assistants.create(
-        model="llama-cpp-python",
+        model=model_name,
         name="Test Assistant",
         instructions="You are a helpful assistant with access to a knowledge base about AI and machine learning.",
         tools=[{"type": "file_search"}],
diff --git a/tests/e2e/utils.py b/tests/e2e/utils.py
deleted file mode 100644
index 32eb8daff..000000000
--- a/tests/e2e/utils.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import json
-import logging
-import os
-import traceback
-import pytest
-import requests
-
-# This is the anon_key for supabase, it provides access to the endpoints that would otherwise be inaccessible
-ANON_KEY = os.environ["ANON_KEY"]
-SERVICE_KEY = os.environ["SERVICE_KEY"]
-DEFAULT_TEST_EMAIL = "fakeuser1@test.com"
-DEFAULT_TEST_PASSWORD = "password"
-
-
-def create_test_user(
-    anon_key: str = ANON_KEY,
-    email: str = DEFAULT_TEST_EMAIL,
-    password: str = DEFAULT_TEST_PASSWORD,
-) -> str:
-    headers = {
-        "apikey": f"{anon_key}",
-        "Authorization": f"Bearer {anon_key}",
-        "Content-Type": "application/json",
-    }
-
-    try:
-        requests.post(
-            url="https://supabase-kong.uds.dev/auth/v1/signup",
-            headers=headers,
-            json={
-                "email": email,
-                "password": password,
-                "confirmPassword": password,
-            },
-        )
-    except Exception:
-        logging.error(
-            "Error creating user (likely because the user already exists): %s",
-            traceback.format_exc(),
-        )
-
-    return get_jwt_token(anon_key, email, password)
-
-
-def get_jwt_token(
-    api_key: str,
-    test_email: str = DEFAULT_TEST_EMAIL,
-    test_password: str = DEFAULT_TEST_PASSWORD,
-) -> str:
-    url = "https://supabase-kong.uds.dev/auth/v1/token?grant_type=password"
-    headers = {"apikey": f"{api_key}", "Content-Type": "application/json"}
-    data = {"email": test_email, "password": test_password}
-
-    response = requests.post(url, headers=headers, json=data)
-    if response.status_code != 200:
-        pytest.fail(
-            f"Request for the JWT token failed with status code {response.status_code} expected 200",
-            False,
-        )
-
-    return json.loads(response.content)["access_token"]
diff --git a/tests/integration/api/test_rag_files.py b/tests/integration/api/test_rag_files.py
index 45f832418..7520ddbcc 100644
--- a/tests/integration/api/test_rag_files.py
+++ b/tests/integration/api/test_rag_files.py
@@ -1,9 +1,13 @@
 import os
+from typing import Optional
+
+import requests
 from openai.types.beta.threads.text import Text
 import pytest

 from tests.utils.data_path import data_path
-from tests.utils.client import client_config_factory
+from leapfrogai_api.typedef.rag.rag_types import ConfigurationPayload
+from tests.utils.client import client_config_factory, get_leapfrogai_api_url_base


 def make_test_assistant(client, model, vector_store_id):
@@ -77,3 +81,66 @@

     for a in message_content.annotations:
         print(a.text)
+
+
+def configure_rag(
+    enable_reranking: bool,
+    ranking_model: str,
+    rag_top_k_when_reranking: int,
+):
+    """
+    Configures the RAG settings.
+
+    Args:
+        enable_reranking: Whether to enable reranking.
+        ranking_model: The ranking model to use.
+        rag_top_k_when_reranking: The top-k results to return before reranking.
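+
+    Example (illustrative only; mirrors the call made in the reranking test below)::
+
+        configure_rag(
+            enable_reranking=True,
+            ranking_model="flashrank",
+            rag_top_k_when_reranking=100,
+        )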
+ """ + url = f"{get_leapfrogai_api_url_base()}/leapfrogai/v1/rag/configure" + configuration = ConfigurationPayload( + enable_reranking=enable_reranking, + ranking_model=ranking_model, + rag_top_k_when_reranking=rag_top_k_when_reranking, + ) + + try: + response = requests.patch(url, json=configuration.model_dump()) + response.raise_for_status() # Raise an exception for bad status codes (4xx or 5xx) + print("RAG configuration updated successfully.") + except requests.exceptions.RequestException as e: + print(f"Error configuring RAG: {e}") + + +def get_rag_configuration() -> Optional[ConfigurationPayload]: + """ + Retrieves the current RAG configuration. + + Args: + base_url: The base URL of the API. + + Returns: + The RAG configuration, or None if there was an error. + """ + url = f"{get_leapfrogai_api_url_base()}/leapfrogai/v1/rag/configure" + + try: + response = requests.get(url) + response.raise_for_status() + config = ConfigurationPayload.model_validate_json(response.text) + print(f"Current RAG configuration: {config}") + return config + except requests.exceptions.RequestException as e: + print(f"Error getting RAG configuration: {e}") + return None + + +@pytest.mark.skipif( + os.environ.get("LFAI_RUN_NIAH_TESTS") != "true", + reason="LFAI_RUN_NIAH_TESTS envvar was not set to true", +) +def test_rag_needle_haystack_with_reranking(): + configure_rag(True, "flashrank", 100) + config_result = get_rag_configuration() + assert config_result is not None + assert config_result.enable_reranking is True + test_rag_needle_haystack() diff --git a/tests/integration/api/test_vector_stores.py b/tests/integration/api/test_vector_stores.py index 5427a0943..9a3be72a4 100644 --- a/tests/integration/api/test_vector_stores.py +++ b/tests/integration/api/test_vector_stores.py @@ -1,7 +1,6 @@ """Test the API endpoints for assistants.""" import json -import os import time import pytest @@ -19,6 +18,7 @@ ) from leapfrogai_api.routers.openai.vector_stores import router as vector_store_router from leapfrogai_api.routers.openai.files import router as files_router +from tests.utils.client import create_test_user from tests.utils.data_path import data_path, TXT_FILE INSTRUCTOR_XL_EMBEDDING_SIZE: int = 768 @@ -37,11 +37,11 @@ class MissingEnvironmentVariable(Exception): headers: dict[str, str] = {} try: - headers = {"Authorization": f"Bearer {os.environ['SUPABASE_USER_JWT']}"} + headers = {"Authorization": f"Bearer {create_test_user()}"} except KeyError as exc: raise MissingEnvironmentVariable( "SUPABASE_USER_JWT must be defined for the test to pass. " - "Please check the api README for instructions on obtaining this token." + "Please check the packages/api and src/leapfrogai_api READMEs for instructions on obtaining this token." 
     ) from exc

 vector_store_client = TestClient(vector_store_router, headers=headers)
diff --git a/tests/pytest/leapfrogai_api/test_api.py b/tests/pytest/leapfrogai_api/test_api.py
index 724b0dc58..ec6460fda 100644
--- a/tests/pytest/leapfrogai_api/test_api.py
+++ b/tests/pytest/leapfrogai_api/test_api.py
@@ -32,6 +32,7 @@
 )

 TEXT_INPUT_LEN = len(TEXT_INPUT)
+

 #########################
 #########################
@@ -147,6 +148,7 @@ def test_routes():
         "/openai/v1/files": ["POST"],
         "/openai/v1/assistants": ["POST"],
         "/leapfrogai/v1/count/tokens": ["POST"],
+        "/leapfrogai/v1/rag/configure": ["GET", "PATCH"],
     }

     openai_routes = [
@@ -196,10 +198,14 @@ def test_routes():
     ]

     actual_routes = app.routes
-    for route in actual_routes:
-        if hasattr(route, "path") and route.path in expected_routes:
-            assert route.methods == set(expected_routes[route.path])
-            del expected_routes[route.path]
+    for expected_route in expected_routes:
+        matching_routes = {expected_route: []}
+        for actual_route in actual_routes:
+            if hasattr(actual_route, "path") and expected_route == actual_route.path:
+                matching_routes[actual_route.path].extend(actual_route.methods)
+        assert set(expected_routes[expected_route]) <= set(
+            matching_routes[expected_route]
+        )

     for route, name, methods in openai_routes:
         found = False
@@ -214,8 +220,6 @@ def test_routes():
             break
         assert found, f"Missing route: {route}, {name}, {methods}"

-    assert len(expected_routes) == 0
-

 def test_healthz():
     """Test the healthz endpoint."""
@@ -535,3 +539,55 @@ def test_token_count(dummy_auth_middleware):
     assert "token_count" in response_data
     assert isinstance(response_data["token_count"], int)
     assert response_data["token_count"] == len(input_text)
+
+
+@pytest.mark.skipif(
+    os.environ.get("LFAI_RUN_REPEATER_TESTS") != "true"
+    or os.environ.get("DEV") != "true",
+    reason="LFAI_RUN_REPEATER_TESTS envvar was not set to true",
+)
+def test_configure(dummy_auth_middleware):
+    """Test the RAG configuration endpoints."""
+    with TestClient(app) as client:
+        rag_configuration_request = {
+            "enable_reranking": True,
+            "ranking_model": "rankllm",
+            "rag_top_k_when_reranking": 50,
+        }
+        response = client.patch(
+            "/leapfrogai/v1/rag/configure", json=rag_configuration_request
+        )
+        assert response.status_code == 200
+
+        response = client.get("/leapfrogai/v1/rag/configure")
+        assert response.status_code == 200
+        response_data = response.json()
+        assert "enable_reranking" in response_data
+        assert "ranking_model" in response_data
+        assert "rag_top_k_when_reranking" in response_data
+        assert isinstance(response_data["enable_reranking"], bool)
+        assert isinstance(response_data["ranking_model"], str)
+        assert isinstance(response_data["rag_top_k_when_reranking"], int)
+        assert response_data["enable_reranking"] is True
+        assert response_data["ranking_model"] == "rankllm"
+        assert response_data["rag_top_k_when_reranking"] == 50
+
+        # Update only some of the configs to see if the existing ones persist
+        rag_configuration_request = {"ranking_model": "flashrank"}
+        response = client.patch(
+            "/leapfrogai/v1/rag/configure", json=rag_configuration_request
+        )
+        assert response.status_code == 200
+
+        response = client.get("/leapfrogai/v1/rag/configure")
+        assert response.status_code == 200
+        response_data = response.json()
+        assert "enable_reranking" in response_data
+        assert "ranking_model" in response_data
+        assert "rag_top_k_when_reranking" in response_data
+        assert isinstance(response_data["enable_reranking"], bool)
+        assert isinstance(response_data["ranking_model"], str)
+        assert isinstance(response_data["rag_top_k_when_reranking"], int)
+        assert response_data["enable_reranking"] is True
+        assert response_data["ranking_model"] == "flashrank"
+        assert response_data["rag_top_k_when_reranking"] == 50
diff --git a/tests/utils/client.py b/tests/utils/client.py
index 6fe598514..0016f8c4c 100644
--- a/tests/utils/client.py
+++ b/tests/utils/client.py
@@ -1,8 +1,113 @@
+import json
+import logging
+import traceback
 from urllib.parse import urljoin
 from openai import OpenAI
 import os
+import pytest
 import requests
 from requests import Response
+from fastapi import status
+
+ANON_KEY = os.environ["ANON_KEY"]
+SERVICE_KEY = os.environ["SERVICE_KEY"]
+DEFAULT_TEST_EMAIL = "test-user@test.com"
+DEFAULT_TEST_PASSWORD = "password"
+
+
+def get_supabase_url() -> str:
+    """Get the URL for Supabase.
+
+    Returns:
+        str: The URL for Supabase. (default: "https://supabase-kong.uds.dev")
+    """
+
+    return os.getenv("SUPABASE_URL", "https://supabase-kong.uds.dev")
+
+
+def create_test_user(
+    anon_key: str = ANON_KEY,
+    email: str = DEFAULT_TEST_EMAIL,
+    password: str = DEFAULT_TEST_PASSWORD,
+) -> str:
+    """
+    Create a test user in the authentication system.
+
+    This function attempts to create a new user with the given email and password using the specified
+    anonymous API key. If the user already exists, the error is logged. It returns the JWT token
+    for the created or existing user.
+
+    Args:
+        anon_key (str): The anonymous API key for the authentication service.
+        email (str): The email address of the test user. Default is "test-user@test.com".
+        password (str): The password for the test user. Default is "password".
+
+    Returns:
+        str: The JWT token for the created or existing user.
+    """
+    supabase_base_url = get_supabase_url()
+
+    headers = {
+        "apikey": f"{anon_key}",
+        "Authorization": f"Bearer {anon_key}",
+        "Content-Type": "application/json",
+    }
+
+    try:
+        requests.post(
+            url=f"{supabase_base_url}/auth/v1/signup",
+            headers=headers,
+            json={
+                "email": email,
+                "password": password,
+                "confirmPassword": password,
+            },
+        )
+    except Exception:
+        logging.error(
+            "Error creating user (likely because the user already exists): %s",
+            traceback.format_exc(),
+        )
+
+    return get_jwt_token(supabase_base_url, anon_key, email, password)
+
+
+def get_jwt_token(
+    supabase_base_url: str,
+    api_key: str,
+    test_email: str = DEFAULT_TEST_EMAIL,
+    test_password: str = DEFAULT_TEST_PASSWORD,
+) -> str:
+    """
+    Retrieve a JWT token for a test user using email and password.
+
+    This function sends a request to the authentication service to obtain a JWT token using
+    the provided API key, email, and password.
+
+    Args:
+        supabase_base_url (str): The base URL of the Supabase instance.
+        api_key (str): The API key for the authentication service.
+        test_email (str): The email address of the test user. Default is "test-user@test.com".
+        test_password (str): The password for the test user. Default is "password".
+
+    Returns:
+        str: The JWT access token for the authenticated user.
+
+    Raises:
+        Failed: If the request fails or the response status code is not 200 (via pytest.fail).
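+
+    Example (illustrative only; not part of this changeset)::
+
+        token = get_jwt_token(get_supabase_url(), ANON_KEY)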
+ """ + + url = f"{supabase_base_url}/auth/v1/token?grant_type=password" + headers = {"apikey": f"{api_key}", "Content-Type": "application/json"} + data = {"email": test_email, "password": test_password} + + response = requests.post(url, headers=headers, json=data) + if response.status_code != status.HTTP_200_OK: + pytest.fail( + f"Request for the JWT token failed with status code {response.status_code} expected 200", + False, + ) + + return json.loads(response.content)["access_token"] def get_leapfrogai_model() -> str: @@ -12,7 +117,15 @@ def get_leapfrogai_model() -> str: str: The model to use for LeapfrogAI. (default: "vllm") """ - return os.getenv("LEAPFROGAI_MODEL", "vllm") + model = os.getenv("LEAPFROGAI_MODEL") + + if not model: + model = "vllm" + logging.warning( + f"LEAPFROGAI_MODEL is not set, using default model of `{model}`" + ) + + return model def get_openai_key() -> str: @@ -49,14 +162,18 @@ def get_leapfrogai_api_key() -> str: Returns: str: The API key for the LeapfrogAI API. + Raises: ValueError: If LEAPFROGAI_API_KEY or SUPABASE_USER_JWT is not set. """ api_key = os.getenv("LEAPFROGAI_API_KEY") or os.getenv("SUPABASE_USER_JWT") - if api_key is None: - raise ValueError("LEAPFROGAI_API_KEY or SUPABASE_USER_JWT not set") + if not api_key: + logging.warning( + "LEAPFROGAI_API_KEY or SUPABASE_USER_JWT not set, automatically generating test user." + ) + return create_test_user() return api_key @@ -74,9 +191,9 @@ def get_leapfrogai_api_url() -> str: def get_leapfrogai_api_url_base() -> str: """Get the base URL for the LeapfrogAI API. - Set via the LEAPFRAGAI_API_URL environment variable. + Set via the LEAPFROGAI_API_URL environment variable. - If LEAPFRAGAI_API_URL is set to "https://leapfrogai-api.uds.dev/openai/v1", this will trim off the "/openai/v1" part. + If LEAPFROGAI_API_URL is set to "https://leapfrogai-api.uds.dev/openai/v1", this will trim off the "/openai/v1" part. Returns: str: The base URL for the LeapfrogAI API. (default: "https://leapfrogai-api.uds.dev")