diff --git a/.github/actions/release/action.yaml b/.github/actions/release/action.yaml index 63afd9b1d..38157f59e 100644 --- a/.github/actions/release/action.yaml +++ b/.github/actions/release/action.yaml @@ -138,7 +138,7 @@ runs: run: | docker buildx build --build-arg LOCAL_VERSION=${{ inputs.releaseTag }} -t ghcr.io/defenseunicorns/leapfrogai/vllm:${{ inputs.releaseTag }} --push -f packages/vllm/Dockerfile . - zarf package create packages/vllm --set=IMAGE_VERSION=${{ inputs.releaseTag }} --flavor upstream --confirm + ZARF_CONFIG=packages/vllm/zarf-config.yaml zarf package create packages/vllm --set=IMAGE_VERSION=${{ inputs.releaseTag }} --flavor upstream --confirm zarf package publish zarf-package-vllm-amd64-${{ inputs.releaseTag }}.tar.zst oci://ghcr.io/defenseunicorns/packages${{ inputs.subRepository }}leapfrogai diff --git a/.github/release-please-config.json b/.github/release-please-config.json index 6b03844af..8f475204d 100644 --- a/.github/release-please-config.json +++ b/.github/release-please-config.json @@ -11,7 +11,6 @@ "versioning": "default", "extra-files": [ "pyproject.toml", - ".github/workflows/nightly-snapshot-release.yaml", { "type": "generic", "path": "**/Chart.yaml", @@ -27,6 +26,11 @@ "path": "**/zarf.yaml", "glob": true }, + { + "type": "generic", + "path": "**/zarf-config.yaml", + "glob": true + }, { "type": "generic", "path": "**/uds-bundle.yaml", diff --git a/.github/scripts/uds_verification_report.py b/.github/scripts/uds_verification_report.py new file mode 100755 index 000000000..0e4d4e8fe --- /dev/null +++ b/.github/scripts/uds_verification_report.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python3 + +import os +import re + + +def remove_ansi_escape_sequences(text): + ansi_escape = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]") + return ansi_escape.sub("", text) + + +# Capabilities that affect the entire capability, not just a single package +def uds_capability_wide_errors(text: str) -> bool: + if "Not all pods have the istio sidecar" in text: + return True + return False + + +# CI environment variable enables GitHub annotations +def print_package_info( + package_name, + failures_count, + errors_count, + warnings_count, + failure_descriptions, + error_descriptions, + warning_descriptions, + uds_capability_wide_errors_count, +): + if uds_capability_wide_errors_count >= 1: + errors_count -= uds_capability_wide_errors_count + if package_name: + print("-----------------------------") + if os.getenv("CI") == "true": + print(f"::group::{package_name}") + print(f"Package: {package_name}\n") + if failures_count > 0: + if os.getenv("CI") == "true": + print("::error::", end="") + print(f"⛔ Failures: {failures_count}") + else: + if errors_count > 0: + if os.getenv("CI") == "true": + print("::error::", end="") + print(f"❌ Errors: {errors_count}") + if warnings_count > 0: + if os.getenv("CI") == "true": + print("::warning::", end="") + print(f"⚠️ Warnings: {warnings_count}") + if failures_count > 0: + print("\n⛔ Failure Descriptions:") + for desc in failure_descriptions: + print(f" - {desc}") + else: + if errors_count > 0: + print("\n❌ Error Descriptions:") + for desc in error_descriptions: + print(f" - {desc}") + if warnings_count > 0: + print("\n⚠️ Warning Descriptions:") + for desc in warning_descriptions: + print(f" - {desc}") + if os.getenv("CI") == "true": + print("::endgroup::") + + +def main(): + # Read data from the specified file instead of stdin + file_path = os.path.join( + os.getenv("GITHUB_WORKSPACE", ""), "reports/intermediate-report.txt" + ) + with open(file_path, 
mode="r", encoding="utf-8", errors="ignore") as file: + data = file.read() + # Remove ANSI escape sequences + clean_data = remove_ansi_escape_sequences(data) + # Initialize variables + package_name = "" + failures_count = 0 + errors_count = 0 + warnings_count = 0 + uds_capability_wide_errors_count = 0 + failure_descriptions = [] + error_descriptions = [] + warning_descriptions = [] + uds_capability_wide_error_descriptions = [] + previous_package_name = None + + # Process each line + for line in clean_data.splitlines(): + # Remove leading and trailing whitespace + line = line.strip() + + # Match and extract the package name + match = re.match(r"^ℹ️\s+Package\s+Name:\s+(.*)$", line) + if match: + # Print the previous package's info before starting a new one + if previous_package_name is not None: + print_package_info( + previous_package_name, + failures_count, + errors_count, + warnings_count, + failure_descriptions, + error_descriptions, + warning_descriptions, + uds_capability_wide_errors_count, + ) + # Reset variables for the new package + package_name = match.group(1) + failures_count = 0 + errors_count = 0 + warnings_count = 0 + failure_descriptions = [] + error_descriptions = [] + warning_descriptions = [] + previous_package_name = package_name + continue + + if uds_capability_wide_errors(line): + uds_capability_wide_errors_count = 1 + uds_capability_wide_error_descriptions = [ + "Not all pods have the istio sidecar" + ] + continue + else: + # Match and extract counts for failures, errors, and warnings + match = re.match(r"^(❌|⚠️|⛔)\s+(\d+)\s+([a-z]+)\s+found$", line) + if match: + count = int(match.group(2)) + type_ = match.group(3) + if type_ == "errors": + errors_count = count + elif type_ == "warnings": + warnings_count = count + elif type_ == "failures": + failures_count = count + continue + + # Match and collect issue descriptions + match = re.match(r"^(❌|⚠️|⛔)\s+(.*)$", line) + if match: + emoji = match.group(1) + description = match.group(2) + if emoji == "❌": + error_descriptions.append(description) + elif emoji == "⚠️": + warning_descriptions.append(description) + elif emoji == "⛔": + failure_descriptions.append(description) + continue + + # Print the last package's information + if previous_package_name is not None: + print_package_info( + previous_package_name, + failures_count, + errors_count, + warnings_count, + failure_descriptions, + error_descriptions, + warning_descriptions, + uds_capability_wide_errors_count, + ) + if uds_capability_wide_errors_count >= 1: + print("-----------------------------") + if os.getenv("CI") == "true": + print("::group::UDS Capability-Wide Issues") + print("::error::", end="") + print("UDS Capability Issues") + print("\n❌ Error Descriptions:") + for desc in uds_capability_wide_error_descriptions: + print(f" - {desc}") + if os.getenv("CI") == "true": + print("::endgroup::") + + +if __name__ == "__main__": + main() + # Print the final ending separator + print("-----------------------------") diff --git a/.github/workflows/e2e-llama-cpp-python.yaml b/.github/workflows/e2e-llama-cpp-python.yaml index e3d573bba..b3019819f 100644 --- a/.github/workflows/e2e-llama-cpp-python.yaml +++ b/.github/workflows/e2e-llama-cpp-python.yaml @@ -32,6 +32,7 @@ on: # Ignore local development files - "!.pre-commit-config.yaml" + - "!tasks.yaml" # Ignore non e2e tests changes - "!tests/pytest/**" @@ -56,6 +57,11 @@ jobs: runs-on: ai-ubuntu-big-boy-8-core if: ${{ !github.event.pull_request.draft }} + permissions: + contents: read + packages: read + id-token: write # 
This is needed for OIDC federation. + steps: - name: Checkout Repo uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -69,6 +75,7 @@ jobs: registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} ghToken: ${{ secrets.GITHUB_TOKEN }} + chainguardIdentity: ${{ secrets.CHAINGUARD_IDENTITY }} - name: Setup API and Supabase uses: ./.github/actions/lfai-core diff --git a/.github/workflows/e2e-playwright.yaml b/.github/workflows/e2e-playwright.yaml index 7200155fe..ddf9da1c8 100644 --- a/.github/workflows/e2e-playwright.yaml +++ b/.github/workflows/e2e-playwright.yaml @@ -34,6 +34,7 @@ on: # Ignore local development files - "!.pre-commit-config.yaml" + - "!tasks.yaml" # Ignore non e2e tests changes - "!tests/pytest/**" @@ -57,6 +58,11 @@ jobs: runs-on: ai-ubuntu-big-boy-8-core if: ${{ !github.event.pull_request.draft }} + permissions: + contents: read + packages: read + id-token: write # This is needed for OIDC federation. + steps: - name: Checkout Repo uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -82,6 +88,7 @@ jobs: registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} ghToken: ${{ secrets.GITHUB_TOKEN }} + chainguardIdentity: ${{ secrets.CHAINGUARD_IDENTITY }} - name: Create Test User run: | @@ -120,7 +127,7 @@ jobs: - name: UI/API/Supabase E2E Playwright Tests run: | cp src/leapfrogai_ui/.env.example src/leapfrogai_ui/.env - rm src/leapfrogai_ui/tests/global.teardown.ts + rm src/leapfrogai_ui/tests/global.teardown.ts mkdir -p src/leapfrogai_ui/playwright/.auth SERVICE_ROLE_KEY=$(uds zarf tools kubectl get secret -n leapfrogai supabase-bootstrap-jwt -o jsonpath={.data.service-key} | base64 -d) echo "::add-mask::$SERVICE_ROLE_KEY" diff --git a/.github/workflows/e2e-text-backend-full-cpu.yaml b/.github/workflows/e2e-text-backend-full-cpu.yaml index 6e8507ae3..9e7faf01f 100644 --- a/.github/workflows/e2e-text-backend-full-cpu.yaml +++ b/.github/workflows/e2e-text-backend-full-cpu.yaml @@ -32,6 +32,7 @@ on: # Ignore local development files - "!.pre-commit-config.yaml" + - "!tasks.yaml" # Ignore non e2e tests changes - "!tests/pytest/**" @@ -57,6 +58,11 @@ jobs: runs-on: ai-ubuntu-big-boy-8-core if: ${{ !github.event.pull_request.draft }} + permissions: + contents: read + packages: read + id-token: write # This is needed for OIDC federation. 
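The `LEAPFROGAI_MODEL` environment variable added to the Test Text Backend step above appears to select which model backend the end-to-end suite exercises (here, the CPU `llama-cpp-python` backend). A minimal sketch of reproducing that step locally, assuming a deployed LeapfrogAI stack and the repository's Python dev dependencies:

```bash
# Sketch: mirror the CI step above against a local deployment; the env var and
# pytest invocation are taken from the workflow, everything else is assumed.
export LEAPFROGAI_MODEL=llama-cpp-python
python -m pytest ./tests/e2e/test_text_backend_full.py -v
```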
+ steps: - name: Checkout Repo uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -69,6 +75,8 @@ jobs: with: registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} + ghToken: ${{ secrets.GITHUB_TOKEN }} + chainguardIdentity: ${{ secrets.CHAINGUARD_IDENTITY }} - name: Setup LFAI-API and Supabase uses: ./.github/actions/lfai-core @@ -97,5 +105,7 @@ jobs: # Test ########## - name: Test Text Backend + env: + LEAPFROGAI_MODEL: llama-cpp-python run: | python -m pytest ./tests/e2e/test_text_backend_full.py -v diff --git a/.github/workflows/e2e-text-embeddings.yaml b/.github/workflows/e2e-text-embeddings.yaml index 20f7eb97a..3742de352 100644 --- a/.github/workflows/e2e-text-embeddings.yaml +++ b/.github/workflows/e2e-text-embeddings.yaml @@ -32,6 +32,7 @@ on: # Ignore local development files - "!.pre-commit-config.yaml" + - "!tasks.yaml" # Ignore non e2e tests changes - "!tests/pytest/**" @@ -58,6 +59,11 @@ jobs: runs-on: ai-ubuntu-big-boy-8-core if: ${{ !github.event.pull_request.draft }} + permissions: + contents: read + packages: read + id-token: write # This is needed for OIDC federation. + steps: - name: Checkout Repo uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -71,6 +77,7 @@ jobs: registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} ghToken: ${{ secrets.GITHUB_TOKEN }} + chainguardIdentity: ${{ secrets.CHAINGUARD_IDENTITY }} - name: Setup LFAI-API and Supabase uses: ./.github/actions/lfai-core diff --git a/.github/workflows/e2e-vllm.yaml b/.github/workflows/e2e-vllm.yaml index 07e9f046f..6f89948ad 100644 --- a/.github/workflows/e2e-vllm.yaml +++ b/.github/workflows/e2e-vllm.yaml @@ -32,6 +32,7 @@ on: # Ignore local development files - "!.pre-commit-config.yaml" + - "!tasks.yaml" # Ignore non e2e tests changes - "!tests/pytest/**" @@ -58,6 +59,11 @@ jobs: runs-on: ai-ubuntu-big-boy-8-core if: ${{ !github.event.pull_request.draft }} + permissions: + contents: read + packages: read + id-token: write # This is needed for OIDC federation. + steps: - name: Checkout Repo uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -73,7 +79,7 @@ jobs: registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} ghToken: ${{ secrets.GITHUB_TOKEN }} - udsCliVersion: 0.14.0 + chainguardIdentity: ${{ secrets.CHAINGUARD_IDENTITY }} ########## # vLLM @@ -82,4 +88,4 @@ jobs: ########## - name: Build vLLM run: | - make build-vllm LOCAL_VERSION=e2e-test + make build-vllm LOCAL_VERSION=e2e-test ZARF_CONFIG=packages/vllm/zarf-config.yaml diff --git a/.github/workflows/e2e-whisper.yaml b/.github/workflows/e2e-whisper.yaml index dee2cf45a..90e94106e 100644 --- a/.github/workflows/e2e-whisper.yaml +++ b/.github/workflows/e2e-whisper.yaml @@ -32,6 +32,7 @@ on: # Ignore local development files - "!.pre-commit-config.yaml" + - "!tasks.yaml" # Ignore non e2e tests changes - "!tests/pytest/**" @@ -56,6 +57,11 @@ jobs: runs-on: ai-ubuntu-big-boy-8-core if: ${{ !github.event.pull_request.draft }} + permissions: + contents: read + packages: read + id-token: write # This is needed for OIDC federation. 
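The `Build vLLM` step above now exports `ZARF_CONFIG=packages/vllm/zarf-config.yaml`, so `zarf package create` reads its `--set` values from a config file instead of CLI flags. The vLLM config file itself is not shown in this diff; the sketch below is an assumption modeled on the API package's `zarf-config.yaml` added later in this changeset, purely to illustrate the `package.create.set` shape that `ZARF_CONFIG` points at:

```bash
# Hypothetical example file; keys mirror packages/api/zarf-config.yaml from this PR.
cat <<'EOF' > /tmp/example-zarf-config.yaml
package:
  create:
    set:
      image_version: "0.13.1"
EOF

# ZARF_CONFIG makes the set values above available to `package create` without --set flags.
ZARF_CONFIG=/tmp/example-zarf-config.yaml uds zarf package create packages/vllm --flavor upstream --confirm
```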
+ steps: - name: Checkout Repo uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -71,6 +77,7 @@ jobs: registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} ghToken: ${{ secrets.GITHUB_TOKEN }} + chainguardIdentity: ${{ secrets.CHAINGUARD_IDENTITY }} - name: Setup LFAI-API and Supabase uses: ./.github/actions/lfai-core diff --git a/.github/workflows/nightly-snapshot-release.yaml b/.github/workflows/nightly-snapshot-release.yaml index 7be66a934..da6abcdef 100644 --- a/.github/workflows/nightly-snapshot-release.yaml +++ b/.github/workflows/nightly-snapshot-release.yaml @@ -6,11 +6,11 @@ on: workflow_dispatch: # trigger manually as needed pull_request: types: - - opened # default trigger - - reopened # default trigger - - synchronize # default trigger - - ready_for_review # don't run on draft PRs - - milestoned # allows us to trigger on bot PRs + - opened # default trigger + - reopened # default trigger + - synchronize # default trigger + - ready_for_review # don't run on draft PRs + - milestoned # allows us to trigger on bot PRs paths: - .github/workflows/nightly-snapshot-release.yaml @@ -23,10 +23,8 @@ defaults: shell: bash env: - # x-release-please-start-version - LEAPFROGAI_VERSION: 0.13.0 - # x-release-please-end SNAPSHOT_VERSION: snapshot-latest + SNAPSHOT_SUB_REPOSITORY: /uds/snapshots/ permissions: contents: read @@ -47,7 +45,7 @@ jobs: uses: ./.github/actions/release with: releaseTag: ${{ env.SNAPSHOT_VERSION }} - subRepository: /uds/snapshots/ + subRepository: ${{ env.SNAPSHOT_SUB_REPOSITORY }} registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} ghToken: ${{ secrets.GITHUB_TOKEN }} @@ -65,19 +63,29 @@ jobs: id-token: write # This is needed for OIDC federation. 
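Replacing the hard-coded `/uds/snapshots/` path with the `SNAPSHOT_SUB_REPOSITORY` environment variable keeps the snapshot publish location in one place; the release action splices it directly into the OCI path it publishes to (see the `zarf package publish` line at the top of this diff). A quick illustration of how the pieces compose:

```bash
# Illustration only: how the subRepository input expands inside the publish target.
SUB_REPOSITORY="/uds/snapshots/"
echo "oci://ghcr.io/defenseunicorns/packages${SUB_REPOSITORY}leapfrogai"
# -> oci://ghcr.io/defenseunicorns/packages/uds/snapshots/leapfrogai
```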
steps: - - name: Checkout Repo (v${{ env.LEAPFROGAI_VERSION }}) + # Checkout main just to see the latest release in the release-please manifest + - name: Checkout Repo (main) uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: - ref: v${{ env.LEAPFROGAI_VERSION }} - - - name: Setup Python (v${{ env.LEAPFROGAI_VERSION }}) - uses: ./.github/actions/python + ref: main - - name: Install Dev Dependencies + - name: Get Latest Release Version + id: get_version run: | - python -m pip install ".[dev]" ".[dev-vllm]" ".[dev-whisper]" + LFAI_VERSION=$(jq -r '.["."]' .github/.release-please-manifest.json) + echo "LFAI_VERSION=$LFAI_VERSION" >> $GITHUB_OUTPUT + + ################ + # LATEST RELEASE + ################ + + # Checkout the latest release in the release-please manifest + - name: Checkout Repo (v${{ steps.get_version.outputs.LFAI_VERSION }}) + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + ref: v${{ steps.get_version.outputs.LFAI_VERSION }} - - name: Setup UDS Cluster + - name: Setup UDS Cluster (v${{ steps.get_version.outputs.LFAI_VERSION }}) uses: ./.github/actions/uds-cluster with: registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} @@ -85,38 +93,40 @@ jobs: ghToken: ${{ secrets.GITHUB_TOKEN }} chainguardIdentity: ${{ secrets.CHAINGUARD_IDENTITY }} - # This is needed due to delay in tagged release versus - # package publishing and the latest versions of each package in the UDS bundle - - name: Mutation of the UDS Bundle + # This is needed due to delay in tagged releases versus the version refs within the UDS bundles + - name: Mutation of the UDS Bundle (v${{ steps.get_version.outputs.LFAI_VERSION }}) run: | - uds zarf tools yq -i '.metadata.version = "v${{ env.LEAPFROGAI_VERSION }}"' bundles/latest/cpu/uds-bundle.yaml + uds zarf tools yq -i '.metadata.version = "v${{ steps.get_version.outputs.LFAI_VERSION }}"' bundles/latest/cpu/uds-bundle.yaml - uds zarf tools yq -i '.packages[].ref |= sub("^[^ ]+-upstream$", "${{ env.LEAPFROGAI_VERSION }}-upstream")' bundles/latest/cpu/uds-bundle.yaml + uds zarf tools yq -i '.packages[].ref |= sub("^[^ ]+-upstream$", "${{ steps.get_version.outputs.LFAI_VERSION }}-upstream")' bundles/latest/cpu/uds-bundle.yaml - - name: Create and Deploy UDS Bundle (v${{ env.LEAPFROGAI_VERSION }}) + - name: Create and Deploy UDS Bundle (v${{ steps.get_version.outputs.LFAI_VERSION }}) run: | cd bundles/latest/cpu uds create . 
--confirm && \ - uds deploy uds-bundle-leapfrogai-amd64-v${{ env.LEAPFROGAI_VERSION }}.tar.zst --confirm --no-progress && \ - rm -rf uds-bundle-leapfrogai-amd64-v${{ env.LEAPFROGAI_VERSION }}.tar.zst && \ + uds deploy uds-bundle-leapfrogai-amd64-v${{ steps.get_version.outputs.LFAI_VERSION }}.tar.zst --confirm --no-progress && \ + rm -rf uds-bundle-leapfrogai-amd64-v${{ steps.get_version.outputs.LFAI_VERSION }}.tar.zst && \ docker system prune -af + ################# + # MAIN (SNAPSHOT) + ################# + - name: Checkout Repo (main) uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: ref: main - - name: Print the Commit SHA + - name: Print the Commit SHA (main) run: | COMMIT_SHA=$(git rev-parse HEAD) echo "The latest commit on the main branch is: $COMMIT_SHA" - - name: Install Dev Dependencies (main) - run: | - python -m pip install ".[dev]" ".[dev-vllm]" ".[dev-whisper]" --force-reinstall --no-cache-dir + - name: Setup Python (main) + uses: ./.github/actions/python # Set UDS CPU bundle refs and repositories to snapshot-latest - - name: Mutation of the UDS Bundle + - name: Mutation of the UDS Bundle (main) run: | uds zarf tools yq -i '.metadata.version = "${{ env.SNAPSHOT_VERSION }}"' bundles/latest/cpu/uds-bundle.yaml @@ -124,7 +134,7 @@ jobs: uds zarf tools yq -i '.packages[].repository |= sub("/uds/", "/uds/snapshots/")' bundles/latest/cpu/uds-bundle.yaml - - name: Create and Deploy UDS Bundle (${{ env.SNAPSHOT_VERSION }}) + - name: Create and Deploy UDS Bundle (main) run: | cd bundles/latest/cpu uds create . --confirm && \ @@ -132,6 +142,10 @@ jobs: rm -rf uds-bundle-leapfrogai-amd64-${{ env.SNAPSHOT_VERSION }}.tar.zst && \ docker system prune -af + ######### + # TESTING + ######### + - name: Generate Secrets id: generate_secrets run: | @@ -156,6 +170,7 @@ jobs: env: ANON_KEY: ${{ steps.generate_secrets.outputs.ANON_KEY }} SERVICE_KEY: ${{ steps.generate_secrets.outputs.SERVICE_KEY }} + LEAPFROGAI_MODEL: llama-cpp-python run: | python -m pytest -vvv -s ./tests/e2e diff --git a/.github/workflows/nightly-uds-badge-verification.yaml b/.github/workflows/nightly-uds-badge-verification.yaml new file mode 100644 index 000000000..6be419ebb --- /dev/null +++ b/.github/workflows/nightly-uds-badge-verification.yaml @@ -0,0 +1,94 @@ +name: nightly-uds-badge-verification + +on: + schedule: + - cron: "0 11 * * *" # Runs daily at 3 AM PST + workflow_dispatch: # trigger manually as needed + pull_request: + paths: + - .github/workflows/nightly-uds-badge-verification.yaml + - tasks.yaml + +concurrency: + group: nightly-uds-badge-verification-${{ github.ref }} + cancel-in-progress: true + +defaults: + run: + shell: bash + +env: + SNAPSHOT_VERSION: snapshot-latest + +permissions: + contents: read + packages: read + id-token: write # This is needed for OIDC federation. 
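Dropping the `LEAPFROGAI_VERSION` env (and its release-please annotations) in favor of the `Get Latest Release Version` step means the snapshot workflow now derives the latest tagged release from the release-please manifest on `main`. A small sketch of what that `jq` lookup does; the manifest contents below are illustrative, following the standard release-please format:

```bash
# Illustrative manifest; the real file lives at .github/.release-please-manifest.json.
cat > /tmp/.release-please-manifest.json <<'EOF'
{
  ".": "0.13.1"
}
EOF

# Same expression as the workflow step: read the root (".") entry as the version.
LFAI_VERSION=$(jq -r '.["."]' /tmp/.release-please-manifest.json)
echo "$LFAI_VERSION"   # -> 0.13.1, later used as the v${LFAI_VERSION} checkout ref
```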
+ +jobs: + uds-badge-verification: + runs-on: ai-ubuntu-big-boy-8-core + name: nightly_uds_badge_verification + + steps: + - name: Checkout Repo + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + ref: main + + - name: Setup UDS Cluster + uses: ./.github/actions/uds-cluster + with: + registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} + registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} + ghToken: ${{ secrets.GITHUB_TOKEN }} + chainguardIdentity: ${{ secrets.CHAINGUARD_IDENTITY }} + + - name: Print the Commit SHA + run: | + COMMIT_SHA=$(git rev-parse HEAD) + echo "The latest commit on the main branch is: $COMMIT_SHA" + + # Set UDS CPU bundle refs and repositories to snapshot-latest + - name: Mutation of the UDS Bundle + run: | + uds zarf tools yq -i '.metadata.version = "${{ env.SNAPSHOT_VERSION }}"' bundles/latest/cpu/uds-bundle.yaml + + uds zarf tools yq -i '.packages[].ref |= sub("^[^ ]+-upstream$", "${{ env.SNAPSHOT_VERSION }}-upstream")' bundles/latest/cpu/uds-bundle.yaml + + uds zarf tools yq -i '.packages[].repository |= sub("/uds/", "/uds/snapshots/")' bundles/latest/cpu/uds-bundle.yaml + + - name: Create and Deploy UDS Bundle (${{ env.SNAPSHOT_VERSION }}) + run: | + cd bundles/latest/cpu + uds create . --confirm && \ + uds deploy uds-bundle-leapfrogai-amd64-${{ env.SNAPSHOT_VERSION }}.tar.zst --confirm --no-progress && \ + rm -rf uds-bundle-leapfrogai-amd64-${{ env.SNAPSHOT_VERSION }}.tar.zst && \ + docker system prune -af + + # Workaround for handling emojis in the upstream badge verification UDS task + - name: Set Locale to UTF-8 + run: | + sudo apt-get update + sudo apt-get install -y locales + sudo locale-gen en_US.UTF-8 + export LANG=en_US.UTF-8 + export LANGUAGE=en_US:en + export LC_ALL=en_US.UTF-8 + + # Setup Python for the report cleaning script in the next step + - name: Set up Python + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + with: + python-version-file: "pyproject.toml" + + - name: Run UDS Badge Verification Task + run: | + uds run nightly-uds-badge-verification --no-progress + + - name: Archive UDS Badge Verification Report + uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6 + with: + name: uds-badge-verification-report + path: reports + retention-days: 7 diff --git a/.github/workflows/pytest.yaml b/.github/workflows/pytest.yaml index 93d0f0832..fec72192b 100644 --- a/.github/workflows/pytest.yaml +++ b/.github/workflows/pytest.yaml @@ -31,7 +31,10 @@ on: - "!packages/ui/**" # Declare default permissions as read only. -permissions: read-all +permissions: + contents: read + packages: read + id-token: write # This is needed for OIDC federation. 
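The badge-verification job above leaves its raw task output under `reports/`, which the new `.github/scripts/uds_verification_report.py` (added at the top of this diff) condenses into per-package summaries and GitHub annotations. A self-contained way to see what the script does, using a fabricated report whose lines follow the `ℹ️ Package Name:` / `❌ N errors found` format its regexes expect:

```bash
# Fabricated sample input; only the line format is taken from the script's regexes.
mkdir -p reports
cat > reports/intermediate-report.txt <<'EOF'
ℹ️ Package Name: leapfrogai-api
❌ 2 errors found
⚠️ 1 warnings found
❌ No monitors defined
❌ Not all applicable network policies are using ports
⚠️ No SSO configuration found, review needed
EOF

# Run from the repo root; GITHUB_WORKSPACE tells the script where reports/ lives.
GITHUB_WORKSPACE="$(pwd)" python3 .github/scripts/uds_verification_report.py
```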
concurrency: group: pytest-integration-${{ github.ref }} @@ -64,6 +67,7 @@ jobs: run: make test-api-unit env: LFAI_RUN_REPEATER_TESTS: true + DEV: true integration: runs-on: ai-ubuntu-big-boy-8-core @@ -97,6 +101,7 @@ jobs: registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} ghToken: ${{ secrets.GITHUB_TOKEN }} + chainguardIdentity: ${{ secrets.CHAINGUARD_IDENTITY }} - name: Setup API and Supabase uses: ./.github/actions/lfai-core diff --git a/.github/workflows/uds-lint.yaml b/.github/workflows/uds-lint.yaml index 8f2e6834c..cf7050cb8 100644 --- a/.github/workflows/uds-lint.yaml +++ b/.github/workflows/uds-lint.yaml @@ -46,3 +46,11 @@ jobs: run: | check-jsonschema bundles/latest/gpu/uds-bundle.yaml --schemafile uds.schema.json check-jsonschema bundles/latest/cpu/uds-bundle.yaml --schemafile uds.schema.json + + - name: Download UDS Tasks Schema + run: curl -o tasks.schema.json https://raw.githubusercontent.com/defenseunicorns/uds-cli/v0.14.0/tasks.schema.json + + - name: Validate tasks.yaml + if: always() + run: | + check-jsonschema tasks.yaml --schemafile tasks.schema.json diff --git a/.github/workflows/e2e-registry1-weekly.yaml b/.github/workflows/weekly-registry1-flavor-test.yaml similarity index 56% rename from .github/workflows/e2e-registry1-weekly.yaml rename to .github/workflows/weekly-registry1-flavor-test.yaml index 65f4c5897..21d799c9b 100644 --- a/.github/workflows/e2e-registry1-weekly.yaml +++ b/.github/workflows/weekly-registry1-flavor-test.yaml @@ -1,8 +1,8 @@ -name: e2e-registry1-weekly +name: weekly-registry1-flavor-test on: schedule: - - cron: "0 0 * * 6" # Run every Sunday at 12 AM EST + - cron: "0 8 * * 0" # Run every Sunday at 12 AM PST workflow_dispatch: # trigger manually as needed pull_request: types: @@ -12,11 +12,11 @@ on: - ready_for_review # don't run on draft PRs - milestoned # allows us to trigger on bot PRs paths: - - .github/workflows/e2e-registry1-weekly.yaml + - .github/workflows/weekly-registry1-flavor-test.yaml - bundles/latest/** concurrency: - group: e2e-registry1-weekly-${{ github.ref }} + group: weekly-registry1-flavor-test-${{ github.ref }} cancel-in-progress: true defaults: @@ -24,67 +24,98 @@ defaults: shell: bash jobs: - test-flavors: + registry1-flavor-test: runs-on: ai-ubuntu-big-boy-8-core - name: e2e_registry1_weekly + name: weekly_registry1_flavor_test if: ${{ !github.event.pull_request.draft }} permissions: contents: read - packages: write + packages: read id-token: write # This is needed for OIDC federation. 
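The new `uds-lint` steps validate the root `tasks.yaml` against the uds-cli v0.14.0 tasks schema. The same check can be reproduced locally, assuming `check-jsonschema` is available (installable with `pip install check-jsonschema`):

```bash
# Local equivalent of the new lint steps; schema URL copied from the workflow.
curl -o tasks.schema.json https://raw.githubusercontent.com/defenseunicorns/uds-cli/v0.14.0/tasks.schema.json
check-jsonschema tasks.yaml --schemafile tasks.schema.json
```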
steps: - - name: Checkout Repo + # Checkout main just to see the latest release in the release-please manifest + - name: Checkout Repo (main) uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: - # x-release-please-start-version - ref: "caf4f9c3093a55a003b49fcbf05c03221be6a232" # 0.12.2 w/ integration tests turned-on - # x-release-please-end + ref: main - - name: Setup Python - uses: ./.github/actions/python + - name: Get Latest Release Version + id: get_version + run: | + LFAI_VERSION=$(jq -r '.["."]' .github/.release-please-manifest.json) + echo "LFAI_VERSION=$LFAI_VERSION" >> $GITHUB_OUTPUT - - name: Install API and SDK Dev Dependencies - run : | - make install + ################ + # LATEST RELEASE + ################ + + - name: Checkout Repo + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + fetch-tags: true + ref: v${{ steps.get_version.outputs.LFAI_VERSION }} - - name: Setup UDS Cluster - uses: ./.github/actions/uds-cluster + - name: Setup UDS Environment + uses: defenseunicorns/uds-common/.github/actions/setup@24c8a2a48eeb33773b76b3587c489cb17496c9e0 # v0.12.0 with: registry1Username: ${{ secrets.IRON_BANK_ROBOT_USERNAME }} registry1Password: ${{ secrets.IRON_BANK_ROBOT_PASSWORD }} ghToken: ${{ secrets.GITHUB_TOKEN }} - udsCliVersion: 0.14.0 + chainguardIdentity: ${{ secrets.CHAINGUARD_IDENTITY }} - - name: Create UDS Cluster - shell: bash + - name: Setup Python + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c #v5.0.0 + with: + python-version-file: "pyproject.toml" + + - name: Install Python Dependencies + run: pip install ".[dev]" "src/leapfrogai_api" "src/leapfrogai_sdk" --no-cache-dir + + - name: Mutation of the Zarf Packages run: | - UDS_CONFIG=.github/config/uds-config.yaml make create-uds-cpu-cluster + uds zarf tools yq -i ' + .components[].images[0] |= sub(":v[0-9\.]+$", ":v${{ steps.get_version.outputs.LFAI_VERSION }}") + ' packages/api/zarf.yaml + uds zarf tools yq -i '.api.image.tag = "v${{ steps.get_version.outputs.LFAI_VERSION }}"' packages/api/values/registry1-values.yaml - - name: Setup Playwright + - name: Print the Modified Zarf Packages run: | - npm --prefix src/leapfrogai_ui ci - npx --prefix src/leapfrogai_ui playwright install + cat packages/api/zarf.yaml + cat packages/api/values/registry1-values.yaml - - name: Create Registry1 Packages + - name: Create Registry1 Zarf Packages run: | - LOCAL_VERSION=registry1 FLAVOR=registry1 make build-api + uds zarf package create packages/api --set image_version="${{ steps.get_version.outputs.LFAI_VERSION }}" --flavor registry1 -a amd64 --confirm # Mutate UDS bundle definition to use Registry1 packages - - name: Mutation to Registry1 Bundle - # TODO: fix bundle path + # Mutate non-Registry1 packages to be the current tagged version + - name: Mutation of the UDS Bundle run: | - uds zarf tools yq -i '.packages[1] |= del(.repository)' bundles/latest/cpu/uds-bundle.yaml - uds zarf tools yq -i '.packages[1] |= .ref = "registry1"' bundles/latest/cpu/uds-bundle.yaml - uds zarf tools yq -i '.packages[1] |= .path = "../../../packages/api"' bundles/latest/cpu/uds-bundle.yaml uds zarf tools yq -i '.metadata.version = "registry1"' bundles/latest/cpu/uds-bundle.yaml - - name: Create and Deploy Bundle + uds zarf tools yq -i '.packages[].ref |= sub("^[^ ]+-upstream$", "${{ steps.get_version.outputs.LFAI_VERSION }}-upstream")' bundles/latest/cpu/uds-bundle.yaml + + uds zarf tools yq -i '.packages[1] |= del(.repository)' bundles/latest/cpu/uds-bundle.yaml + uds 
zarf tools yq -i '.packages[1] |= .ref = "${{ steps.get_version.outputs.LFAI_VERSION }}"' bundles/latest/cpu/uds-bundle.yaml + uds zarf tools yq -i '.packages[1] |= .path = "../../../"' bundles/latest/cpu/uds-bundle.yaml + + - name: Print the Modified UDS Bundle + run: | + cat bundles/latest/cpu/uds-config.yaml + cat bundles/latest/cpu/uds-bundle.yaml + + - name: Create UDS Cluster + shell: bash + run: | + UDS_CONFIG=.github/config/uds-config.yaml make create-uds-cpu-cluster + + - name: Create and Deploy Registry1 Bundle run: | cd bundles/latest/cpu uds create . --confirm && \ - uds deploy uds-bundle-leapfrogai-amd64-registry1.tar.zst --confirm --no-progress && \ + uds deploy uds-bundle-leapfrogai-amd64-registry1.tar.zst --confirm --no-progress --log-level debug && \ rm -rf uds-bundle-leapfrogai-amd64-registry1.tar.zst && \ docker system prune -af @@ -107,32 +138,19 @@ jobs: echo "ANON_KEY is set: ${{ steps.generate_secrets.outputs.ANON_KEY != '' }}" echo "SERVICE_KEY is set: ${{ steps.generate_secrets.outputs.SERVICE_KEY != '' }}" - - name: Run Integration Tests - env: - SUPABASE_ANON_KEY: ${{ steps.generate_secrets.outputs.ANON_KEY }} - SUPABASE_PASS: ${{ steps.generate_secrets.outputs.FAKE_PASSWORD }} - SUPABASE_EMAIL: integration@uds.dev - SUPABASE_URL: https://supabase-kong.uds.dev - # Turn off NIAH tests that are not applicable for integration testing using the Repeater model - LFAI_RUN_NIAH_TESTS: "false" - run: | - uds zarf connect --name=llama-cpp-python-model --namespace=leapfrogai --local-port=50051 --remote-port=50051 & - while ! nc -z localhost 50051; do sleep 1; done - - make test-user-pipeline - env $(cat .env | xargs) python -m pytest -v -s tests/integration/api - # Backends - name: Run Backend E2E Tests env: ANON_KEY: ${{ steps.generate_secrets.outputs.ANON_KEY }} SERVICE_KEY: ${{ steps.generate_secrets.outputs.SERVICE_KEY }} + LEAPFROGAI_MODEL: llama-cpp-python + run: | + python -m pytest -vvv -s ./tests/e2e + + - name: Setup Playwright run: | - python -m pytest ./tests/e2e/test_llama.py -vv - python -m pytest ./tests/e2e/test_text_embeddings.py -vv - python -m pytest ./tests/e2e/test_whisper.py -vv - python -m pytest ./tests/e2e/test_supabase.py -vv - python -m pytest ./tests/e2e/test_api.py -vv + npm --prefix src/leapfrogai_ui ci + npx --prefix src/leapfrogai_ui playwright install - name: Run Playwright E2E Tests env: @@ -156,3 +174,12 @@ jobs: name: playwright-report path: src/leapfrogai_ui/e2e-report/ retention-days: 30 + + - name: Get Cluster Debug Information + id: debug + if: ${{ !cancelled() }} + uses: defenseunicorns/uds-common/.github/actions/debug-output@e3008473beab00b12a94f9fcc7340124338d5c08 # v0.13.1 + + - name: Get Cluster Debug Information + if: ${{ !cancelled() && steps.debug.conclusion == 'success' }} + uses: defenseunicorns/uds-common/.github/actions/save-logs@e3008473beab00b12a94f9fcc7340124338d5c08 # v0.13.1 diff --git a/.gitignore b/.gitignore index 645bd6ff5..d0c8a20f3 100644 --- a/.gitignore +++ b/.gitignore @@ -34,6 +34,7 @@ node_modules package.json package-lock.json **/*.schema.json +reports # local model and tokenizer files *.bin diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6caadd6c8..401bcba03 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -137,3 +137,26 @@ repos: files: "uds-bundle.yaml" types: [yaml] args: ["--schemafile", "uds-v0.14.0.schema.json"] + + # UDS TASKS CHECK + - repo: local + hooks: + - id: download-schema + name: "Download UDS Tasks Schema" + entry: | + bash -c 
'FILE="tasks-v0.14.0.schema.json" + if [ -f "$(git rev-parse --show-toplevel)/$FILE" ]; then + echo "$FILE already exists in the root of the git project, skipping download." + else + curl -o "$FILE" https://raw.githubusercontent.com/defenseunicorns/uds-cli/v0.14.0/tasks.schema.json + fi' + language: system + + - repo: https://github.com/python-jsonschema/check-jsonschema + rev: 0.14.0 + hooks: + - id: check-jsonschema + name: "Validate UDS Bundles Against Schema" + files: "tasks.yaml" + types: [yaml] + args: ["--schemafile", "tasks-v0.14.0.schema.json"] diff --git a/Makefile b/Makefile index bf8afb315..da9266246 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,10 @@ ARCH ?= amd64 +FLAVOR ?= upstream REG_PORT ?= 5000 REG_NAME ?= registry LOCAL_VERSION ?= $(shell git rev-parse --short HEAD) DOCKER_FLAGS := ZARF_FLAGS := -FLAVOR := upstream SILENT_DOCKER_FLAGS := --quiet SILENT_ZARF_FLAGS := --no-progress -l warn --no-color MAX_JOBS := 4 @@ -55,24 +55,34 @@ build-supabase: local-registry docker-supabase docker-api: local-registry sdk-wheel @echo $(DOCKER_FLAGS) @echo $(ZARF_FLAGS) -ifeq ($(FLAVOR),upstream) + ## Build the API image (and tag it for the local registry) docker build ${DOCKER_FLAGS} --platform=linux/${ARCH} --build-arg LOCAL_VERSION=${LOCAL_VERSION} -t ghcr.io/defenseunicorns/leapfrogai/leapfrogai-api:${LOCAL_VERSION} -f packages/api/Dockerfile . docker tag ghcr.io/defenseunicorns/leapfrogai/leapfrogai-api:${LOCAL_VERSION} localhost:${REG_PORT}/defenseunicorns/leapfrogai/leapfrogai-api:${LOCAL_VERSION} -endif + ## Build the migration container for this version of the API docker build ${DOCKER_FLAGS} --platform=linux/${ARCH} -t ghcr.io/defenseunicorns/leapfrogai/api-migrations:${LOCAL_VERSION} -f Dockerfile.migrations --build-arg="MIGRATIONS_DIR=packages/api/supabase/migrations" . 
docker tag ghcr.io/defenseunicorns/leapfrogai/api-migrations:${LOCAL_VERSION} localhost:${REG_PORT}/defenseunicorns/leapfrogai/api-migrations:${LOCAL_VERSION} -build-api: local-registry docker-api ## Build the leapfrogai_api container and Zarf package +## If registry1, don't locally Docker-build anything +ifeq ($(FLAVOR),upstream) + DOCKER_TARGETS := local-registry docker-api +else + DOCKER_TARGETS := +endif + +build-api: $(DOCKER_TARGETS) ## Build the leapfrogai_api container and Zarf package + ## Only push to local registry and build if this is an upstream-flavored package ifeq ($(FLAVOR),upstream) ## Push the images to the local registry (Zarf is super slow if the image is only in the local daemon) docker push ${DOCKER_FLAGS} localhost:${REG_PORT}/defenseunicorns/leapfrogai/leapfrogai-api:${LOCAL_VERSION} -endif docker push ${DOCKER_FLAGS} localhost:${REG_PORT}/defenseunicorns/leapfrogai/api-migrations:${LOCAL_VERSION} - ## Build the Zarf package uds zarf package create packages/api --flavor ${FLAVOR} -a ${ARCH} -o packages/api --registry-override=ghcr.io=localhost:${REG_PORT} --insecure --set IMAGE_VERSION=${LOCAL_VERSION} ${ZARF_FLAGS} --confirm +else + ## Build the registry1 Zarf package + ZARF_CONFIG=packages/api/zarf-config.yaml uds zarf package create packages/api --flavor ${FLAVOR} -a ${ARCH} -o packages/api ${ZARF_FLAGS} --confirm +endif docker-ui: ## Build the UI image (and tag it for the local registry) @@ -113,7 +123,7 @@ build-vllm: local-registry docker-vllm ## Build the vllm container and Zarf pack docker push ${DOCKER_FLAGS} localhost:${REG_PORT}/defenseunicorns/leapfrogai/vllm:${LOCAL_VERSION} ## Build the Zarf package - uds zarf package create packages/vllm --flavor ${FLAVOR} -a ${ARCH} -o packages/vllm --registry-override=ghcr.io=localhost:${REG_PORT} --insecure --set IMAGE_VERSION=${LOCAL_VERSION} ${ZARF_FLAGS} --confirm + ZARF_CONFIG=packages/vllm/zarf-config.yaml uds zarf package create packages/vllm --flavor ${FLAVOR} -a ${ARCH} -o packages/vllm --registry-override=ghcr.io=localhost:${REG_PORT} --insecure --set IMAGE_VERSION=${LOCAL_VERSION} ${ZARF_FLAGS} --confirm docker-text-embeddings: sdk-wheel ## Build the image (and tag it for the local registry) @@ -253,7 +263,7 @@ silent-deploy-llama-cpp-python-package: silent-deploy-vllm-package: @echo "Starting VLLM deployment..." 
@mkdir -p .logs - @uds zarf package deploy packages/vllm/zarf-package-vllm-${ARCH}-${LOCAL_VERSION}.tar.zst ${ZARF_FLAGS} --confirm > .logs/deploy-vllm.log 2>&1 + @ZARF_CONFIG=packages/vllm/zarf-config.yaml uds zarf package deploy packages/vllm/zarf-package-vllm-${ARCH}-${LOCAL_VERSION}.tar.zst ${ZARF_FLAGS} --confirm > .logs/deploy-vllm.log 2>&1 @echo "VLLM deployment completed" silent-deploy-text-embeddings-package: diff --git a/README.md b/README.md index 7c09b075b..4e4b1c161 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,9 @@ ![LeapfrogAI](https://github.com/defenseunicorns/leapfrogai/raw/main/docs/imgs/leapfrogai.png) [![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/defenseunicorns/leapfrogai/badge)](https://api.securityscorecards.dev/projects/github.com/defenseunicorns/leapfrogai) +[![Nightly Snapshot Tests](https://github.com/defenseunicorns/leapfrogai/actions/workflows/nightly-snapshot-release.yaml/badge.svg)](https://github.com/defenseunicorns/leapfrogai/actions/workflows/nightly-snapshot-release.yaml) +[![Nightly Made for UDS Test](https://github.com/defenseunicorns/leapfrogai/actions/workflows/nightly-uds-badge-verification.yaml/badge.svg)](https://github.com/defenseunicorns/leapfrogai/actions/workflows/nightly-uds-badge-verification.yaml) +[![Weekly Registry1 Test](https://github.com/defenseunicorns/leapfrogai/actions/workflows/weekly-registry1-flavor-test.yaml/badge.svg)](https://github.com/defenseunicorns/leapfrogai/actions/workflows/weekly-registry1-flavor-test.yaml) ## Table of Contents diff --git a/bundles/dev/gpu/uds-config.yaml b/bundles/dev/gpu/uds-config.yaml index 9ad6cfdb4..1ef7a2634 100644 --- a/bundles/dev/gpu/uds-config.yaml +++ b/bundles/dev/gpu/uds-config.yaml @@ -9,8 +9,31 @@ variables: gpu_limit: 0 # runs on CPU until GPU limit is increased vllm: - gpu_limit: 1 # if <1, vllm won't work, VLLM is GPU only - #tensor_parallel_size: 1 # TODO: reintroduce when vllm changes get pulled in + trust_remote_code: "True" + tensor_parallel_size: "1" + enforce_eager: "False" + gpu_memory_utilization: "0.90" + worker_use_ray: "True" + engine_use_ray: "True" + quantization: "None" + load_format: "auto" + # LeapfrogAI SDK runtime configuration (usually influenced by config.yaml in development) + max_context_length: "32768" + stop_tokens: ", <|im_end|>, <|endoftext|>" + prompt_format_chat_system: "SYSTEM: {}\n" + prompt_format_chat_user: "USER: {}\n" + prompt_format_chat_assistant: "ASSISTANT: {}\n" + temperature: "0.1" + top_p: "1.0" + top_k: "0" + repetition_penalty: "1.0" + max_new_tokens: "8192" + # Pod deployment configuration + gpu_limit: "1" + gpu_runtime: "nvidia" + pvc_size: "15Gi" + pvc_access_mode: "ReadWriteOnce" + pvc_storage_class: "local-path" supabase: domain: "uds.dev" diff --git a/bundles/latest/cpu/uds-bundle.yaml b/bundles/latest/cpu/uds-bundle.yaml index 747645ae3..00327dbec 100644 --- a/bundles/latest/cpu/uds-bundle.yaml +++ b/bundles/latest/cpu/uds-bundle.yaml @@ -4,35 +4,35 @@ kind: UDSBundle metadata: name: leapfrogai description: A UDS bundle for deploying LeapfrogAI - version: 0.12.2-upstream + version: 0.13.1-upstream packages: # Supabase backend for the UI and API to interface with Postgresql - name: supabase repository: ghcr.io/defenseunicorns/packages/uds/leapfrogai/supabase - ref: 0.12.2-upstream + ref: 0.13.1-upstream # API - name: leapfrogai-api repository: ghcr.io/defenseunicorns/packages/uds/leapfrogai/leapfrogai-api - ref: 0.12.2-upstream + ref: 0.13.1-upstream # Chat Model - name: llama-cpp-python repository: 
ghcr.io/defenseunicorns/packages/uds/leapfrogai/llama-cpp-python - ref: 0.12.2-upstream + ref: 0.13.1-upstream # Text Embeddings Model - name: text-embeddings repository: ghcr.io/defenseunicorns/packages/uds/leapfrogai/text-embeddings - ref: 0.12.2-upstream + ref: 0.13.1-upstream # Transcription Model - name: whisper repository: ghcr.io/defenseunicorns/packages/uds/leapfrogai/whisper - ref: 0.12.2-upstream + ref: 0.13.1-upstream # UI - name: leapfrogai-ui repository: ghcr.io/defenseunicorns/packages/uds/leapfrogai/leapfrogai-ui - ref: 0.12.2-upstream + ref: 0.13.1-upstream diff --git a/bundles/latest/gpu/uds-bundle.yaml b/bundles/latest/gpu/uds-bundle.yaml index 3867749a4..ab2a9e0f5 100644 --- a/bundles/latest/gpu/uds-bundle.yaml +++ b/bundles/latest/gpu/uds-bundle.yaml @@ -4,35 +4,35 @@ kind: UDSBundle metadata: name: leapfrogai description: A UDS bundle for deploying LeapfrogAI - version: 0.12.2-upstream + version: 0.13.1-upstream packages: # Supabase backend for the UI and API to interface with Postgresql - name: supabase repository: ghcr.io/defenseunicorns/packages/uds/leapfrogai/supabase - ref: 0.12.2-upstream + ref: 0.13.1-upstream # OpenAI-like API - name: leapfrogai-api repository: ghcr.io/defenseunicorns/packages/uds/leapfrogai/leapfrogai-api - ref: 0.12.2-upstream + ref: 0.13.1-upstream # Model for generic chat and summarization - name: vllm repository: ghcr.io/defenseunicorns/packages/uds/leapfrogai/vllm - ref: 0.12.2-upstream + ref: 0.13.1-upstream # Model for providing vector embeddings for text - name: text-embeddings repository: ghcr.io/defenseunicorns/packages/uds/leapfrogai/text-embeddings - ref: 0.12.2-upstream + ref: 0.13.1-upstream # Model for converting audio to text - name: whisper repository: ghcr.io/defenseunicorns/packages/uds/leapfrogai/whisper - ref: 0.12.2-upstream + ref: 0.13.1-upstream # UI - name: leapfrogai-ui repository: ghcr.io/defenseunicorns/packages/uds/leapfrogai/leapfrogai-ui - ref: 0.12.2-upstream + ref: 0.13.1-upstream diff --git a/bundles/latest/gpu/uds-config.yaml b/bundles/latest/gpu/uds-config.yaml index 9ad6cfdb4..1ef7a2634 100644 --- a/bundles/latest/gpu/uds-config.yaml +++ b/bundles/latest/gpu/uds-config.yaml @@ -9,8 +9,31 @@ variables: gpu_limit: 0 # runs on CPU until GPU limit is increased vllm: - gpu_limit: 1 # if <1, vllm won't work, VLLM is GPU only - #tensor_parallel_size: 1 # TODO: reintroduce when vllm changes get pulled in + trust_remote_code: "True" + tensor_parallel_size: "1" + enforce_eager: "False" + gpu_memory_utilization: "0.90" + worker_use_ray: "True" + engine_use_ray: "True" + quantization: "None" + load_format: "auto" + # LeapfrogAI SDK runtime configuration (usually influenced by config.yaml in development) + max_context_length: "32768" + stop_tokens: ", <|im_end|>, <|endoftext|>" + prompt_format_chat_system: "SYSTEM: {}\n" + prompt_format_chat_user: "USER: {}\n" + prompt_format_chat_assistant: "ASSISTANT: {}\n" + temperature: "0.1" + top_p: "1.0" + top_k: "0" + repetition_penalty: "1.0" + max_new_tokens: "8192" + # Pod deployment configuration + gpu_limit: "1" + gpu_runtime: "nvidia" + pvc_size: "15Gi" + pvc_access_mode: "ReadWriteOnce" + pvc_storage_class: "local-path" supabase: domain: "uds.dev" diff --git a/docs/DEVELOPMENT.md b/docs/DEVELOPMENT.md index 897bfaf5d..98343ef7f 100644 --- a/docs/DEVELOPMENT.md +++ b/docs/DEVELOPMENT.md @@ -13,20 +13,20 @@ Please first see the pre-requisites listed on the LeapfrogAI documentation websi It is **_HIGHLY RECOMMENDED_** that PyEnv be installed on your machine, and a new 
virtual environment is created for every new development branch.

-Follow the installation instructions outlined in the [pyenv](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation) repository to install Python 3.11.6:
+Follow the installation instructions outlined in the [pyenv](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation) repository to install Python 3.11.9:

 ```bash
 # install the correct python version
- pyenv install 3.11.6
+ pyenv install 3.11.9

 # create a new virtual environment named "leapfrogai"
- pyenv virtualenv 3.11.6 leapfrogai
+ pyenv virtualenv 3.11.9 leapfrogai

 # activate the virtual environment
 pyenv activate leapfrogai
 ```

-If your installation process completes successfully but indicates missing packages such as `sqlite3`, execute the following command to install the required packages then proceed with the reinstallation of Python 3.11.6:
+If your installation process completes successfully but indicates missing packages such as `sqlite3`, execute the following command to install the required packages then proceed with the reinstallation of Python 3.11.9:

 ```bash
 sudo apt-get install build-essential zlib1g-dev libffi-dev \
@@ -62,6 +62,52 @@ Many of the directories and sub-directories within this project contain Make tar

 Please refer to each Makefile for more arguments and details on what each target does and is dependent on.

+## UDS Tasks
+
+UDS tasks use the UDS CLI runner, and are defined in the root `tasks.yaml` file.
+
+Currently, the only tasks within the file check LeapfrogAI's progress toward the `Made for UDS` packaging standards. To run the verification task, you must have a [UDS Kubernetes cluster](../packages/k3d-gpu/README.md) and LeapfrogAI (GPU or CPU) deployed. After deploying both major capabilities, you can execute the following:
+
+```bash
+uds run nightly-uds-badge-verification --no-progress
+```
+
+You should get output similar to this, depending on how many components of LeapfrogAI are actually deployed:
+
+```bash
+ • Running "Create Reports Directory"
+
+ ✔ Completed "Create Reports Directory"
+
+ • Running "Run UDS Badge Verification Task"
+
+ ✔ Completed "Run UDS Badge Verification Task"
+
+ • Running "Clean Up Final Report"
+-----------------------------
+Package: leapfrogai-api
+
+❌ Errors: 4
+⚠️ Warnings: 3
+
+❌ Error Descriptions:
+ - Endpoint leapfrogai-api.uds.dev is returning 404
+ - Not all applicable network policies are using selectors
+ - Not all applicable network policies are using ports
+ - No monitors defined
+
+⚠️ Warning Descriptions:
+ - Version is not consistent across flavors and package
+ - Network policies with 'remoteGenerated: Anywhere' are present, review needed
+ - No SSO configuration found, review needed
+-----------------------------
+UDS Capability Issues
+
+❌ Error Descriptions:
+ - Not all pods have the istio sidecar
+-----------------------------
+```
+
 ## Environment Variables

 Be wary of `*config*.yaml` or `.env*` files that are in individual components of the stack. The component's README will usually tell the developer when to fill them out or supply environment variables to a script.
@@ -81,6 +127,7 @@ uds zarf tools registry prune --confirm # create and deploy the new package # FLAVOR can be upstream (default) or registry1 - see README for availability details +# See individual sub-directories for any flavor-specific instructions (e.g., packages/api/README.md) LOCAL_VERSION=dev FLAVOR=upstream REGISTRY_PORT=5000 ARCH=amd64 make build-api LOCAL_VERSION=dev FLAVOR=upstream REGISTRY_PORT=5000 ARCH=amd64 make deploy-api ``` @@ -107,6 +154,7 @@ uds zarf package deploy zarf-package-*.tar.zst --confirm ```bash # FLAVOR can be upstream (default) or registry1 - see README for availability details + # See individual sub-directories for any flavor-specific instructions (e.g., packages/api/README.md) LOCAL_VERSION=dev FLAVOR=upstream ARCH=amd64 make build-cpu # ui, api, llama-cpp-python, text-embeddings, whisper, supabase # OR LOCAL_VERSION=dev FLAVOR=upstream ARCH=amd64 make build-gpu # ui, api, vllm, text-embeddings, whisper, supabase @@ -120,6 +168,7 @@ uds zarf package deploy zarf-package-*.tar.zst --confirm ```bash # FLAVOR can be upstream (default) or registry1 - see README for availability details + # See individual sub-directories for any flavor-specific instructions (e.g., packages/api/README.md) LOCAL_VERSION=dev FLAVOR=upstream ARCH=amd64 make build-ui LOCAL_VERSION=dev FLAVOR=upstream ARCH=amd64 make build-api LOCAL_VERSION=dev FLAVOR=upstream ARCH=amd64 make build-supabase @@ -154,7 +203,7 @@ Although not provided in the example UDS bundle manifests found in this reposito - name: leapfrogai-api repository: ghcr.io/defenseunicorns/packages/leapfrogai/leapfrogai-api # x-release-please-start-version - ref: 0.12.2 + ref: 0.13.1 # x-release-please-end # THE BELOW LINES WERE ADDED FOR DEMONSTRATION PURPOSES @@ -188,6 +237,7 @@ To demonstrate what this would look like for an Apple Silicon Mac: ```bash # FLAVOR can be upstream (default) or registry1 - see README for availability details +# See individual sub-directories for any flavor-specific instructions (e.g., packages/api/README.md) REG_PORT=5001 ARCH=arm64 LOCAL_VERSION=dev FLAVOR=upstream make build-cpu ``` @@ -195,6 +245,7 @@ To demonstrate what this would look like for an older Intel Mac: ```bash # FLAVOR can be upstream (default) or registry1 - see README for availability details +# See individual sub-directories for any flavor-specific instructions (e.g., packages/api/README.md) REG_PORT=5001 ARCH=arm64 LOCAL_VERSION=dev FLAVOR=upstream make build-cpu ``` diff --git a/mk-clean.mk b/mk-clean.mk index ff7e8c61d..4ca00ae89 100644 --- a/mk-clean.mk +++ b/mk-clean.mk @@ -15,8 +15,8 @@ clean-artifacts: # Zarf packages, UDS bundles, Python build artifacts, etc. 
clean-cache: -rm -rf ./**/__pycache__ ./**/*/__pycache__ ./**/**/*/__pycache__ - -rm -rf ./**/*/.ruff_cache ./**/.ruff_cache - -rm -rf ./**/.pytest_cache ./**/*/.pytest_cache + -rm -rf ./.ruff_cache ./**/*/.ruff_cache ./**/.ruff_cache + -rm -rf ./.pytest_cache ./**/.pytest_cache ./**/*/.pytest_cache -rm -rf ./.mypy_cache clean-env: diff --git a/packages/api/README.md b/packages/api/README.md index aa2b34690..2d68d67f8 100644 --- a/packages/api/README.md +++ b/packages/api/README.md @@ -27,6 +27,13 @@ make build-api LOCAL_VERSION=dev FLAVOR=upstream uds zarf package deploy packages/api/zarf-package-leapfrogai-api-*-dev.tar.zst --confirm ``` +For other package flavors, use the following example: + +```bash +make build-api FLAVOR=registry1 +uds zarf package deploy packages/api/zarf-package-leapfrogai-api-*-dev.tar.zst --confirm +``` + ### Local Development See the [source code documentation](../../src/leapfrogai_api/README.md) for running the API from the source code for local Python environment development. diff --git a/packages/api/chart/templates/istio-admin.yaml b/packages/api/chart/templates/istio-admin.yaml new file mode 100644 index 000000000..c369e8786 --- /dev/null +++ b/packages/api/chart/templates/istio-admin.yaml @@ -0,0 +1,24 @@ +{{- if .Capabilities.APIVersions.Has "security.istio.io/v1beta1" }} +apiVersion: security.istio.io/v1beta1 +kind: AuthorizationPolicy +metadata: + name: api-block-metrics-access-from-public-gateway + namespace: {{ .Release.Namespace }} +spec: + selector: + matchLabels: + {{- include "chart.selectorLabels" . | nindent 6 }} + action: DENY + rules: + - to: + - operation: + ports: + - "8080" + paths: + - /metrics* + from: + - source: + notNamespaces: + - istio-admin-gateway + - monitoring +{{- end }} diff --git a/packages/api/chart/templates/uds-package.yaml b/packages/api/chart/templates/uds-package.yaml index a6a83dea8..17220788d 100644 --- a/packages/api/chart/templates/uds-package.yaml +++ b/packages/api/chart/templates/uds-package.yaml @@ -7,6 +7,11 @@ metadata: labels: {{- include "chart.labels" . | nindent 4 }} spec: + monitor: + - portName: http + targetPort: {{ .Values.api.service.port }} + selector: + {{- include "chart.selectorLabels" . | nindent 8 }} network: expose: - service: {{ include "chart.fullname" . 
}} diff --git a/packages/api/chart/values.yaml b/packages/api/chart/values.yaml index 65b397e46..4c217ba8a 100644 --- a/packages/api/chart/values.yaml +++ b/packages/api/chart/values.yaml @@ -25,6 +25,8 @@ api: value: "*.toml" - name: DEFAULT_EMBEDDINGS_MODEL value: "text-embeddings" + - name: DEV + value: "false" - name: PORT value: "8080" - name: SUPABASE_URL diff --git a/packages/api/values/registry1-values.yaml b/packages/api/values/registry1-values.yaml index d269c6415..91f92b168 100644 --- a/packages/api/values/registry1-values.yaml +++ b/packages/api/values/registry1-values.yaml @@ -1,9 +1,7 @@ api: image: repository: "registry1.dso.mil/ironbank/opensource/defenseunicorns/leapfrogai/api" - # x-release-please-start-version - tag: v0.12.2 - # x-release-please-end + tag: v###ZARF_CONST_IMAGE_VERSION### expose: "###ZARF_VAR_EXPOSE_API###" @@ -16,6 +14,8 @@ api: value: "*.toml" - name: DEFAULT_EMBEDDINGS_MODEL value: "###ZARF_VAR_DEFAULT_EMBEDDINGS_MODEL###" + - name: DEV + value: "###ZARF_VAR_DEV###" - name: PORT value: "8080" - name: SUPABASE_URL diff --git a/packages/api/values/upstream-values.yaml b/packages/api/values/upstream-values.yaml index 6d867260e..ef2dcdad9 100644 --- a/packages/api/values/upstream-values.yaml +++ b/packages/api/values/upstream-values.yaml @@ -14,6 +14,8 @@ api: value: "*.toml" - name: DEFAULT_EMBEDDINGS_MODEL value: "###ZARF_VAR_DEFAULT_EMBEDDINGS_MODEL###" + - name: DEV + value: "###ZARF_VAR_DEV###" - name: PORT value: "8080" - name: SUPABASE_URL diff --git a/packages/api/zarf-config.yaml b/packages/api/zarf-config.yaml new file mode 100644 index 000000000..475ac2d48 --- /dev/null +++ b/packages/api/zarf-config.yaml @@ -0,0 +1,6 @@ +package: + create: + set: + # x-release-please-start-version + image_version: "0.13.1" + # x-release-please-end diff --git a/packages/api/zarf.yaml b/packages/api/zarf.yaml index 4fa6c59f2..51e0b5f38 100644 --- a/packages/api/zarf.yaml +++ b/packages/api/zarf.yaml @@ -16,6 +16,9 @@ variables: description: "Flag to expose the OpenAPI schema for debugging." - name: DEFAULT_EMBEDDINGS_MODEL default: "text-embeddings" + - name: DEV + default: "false" + description: "Flag to enable development endpoints." components: - name: leapfrogai-api @@ -47,7 +50,7 @@ components: valuesFiles: - "values/registry1-values.yaml" images: - - "registry1.dso.mil/ironbank/opensource/defenseunicorns/leapfrogai/api:v0.12.2" + - "registry1.dso.mil/ironbank/opensource/defenseunicorns/leapfrogai/api:v###ZARF_PKG_TMPL_IMAGE_VERSION###" # TODO: replace with Ironbank image once hardened: registry1.dso.mil/ironbank/opensource/defenseunicorns/leapfrogai/api/migrations - "ghcr.io/defenseunicorns/leapfrogai/api-migrations:###ZARF_PKG_TMPL_IMAGE_VERSION###" - "registry1.dso.mil/ironbank/kiwigrid/k8s-sidecar:1.23.3" diff --git a/packages/ui/chart/templates/ui/service.yaml b/packages/ui/chart/templates/ui/service.yaml index 15243e806..2cb919567 100644 --- a/packages/ui/chart/templates/ui/service.yaml +++ b/packages/ui/chart/templates/ui/service.yaml @@ -18,11 +18,3 @@ spec: protocol: TCP port: {{ .Values.service.port }} targetPort: {{ .Values.service.port }} ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: {{ include "chart.serviceAccountName" . }} - namespace: {{ .Release.Namespace | default "leapfrogai" }} - labels: - {{- include "chart.labels" . 
| nindent 4 }} diff --git a/packages/vllm/.env.example b/packages/vllm/.env.example index 1e3a00170..0a995e234 100644 --- a/packages/vllm/.env.example +++ b/packages/vllm/.env.example @@ -1,13 +1,12 @@ -export LAI_HF_HUB_ENABLE_HF_TRANSFER="1" -export LAI_REPO_ID="TheBloke/Synthia-7B-v2.0-GPTQ" -export LAI_REVISION="gptq-4bit-32g-actorder_True" -export LAI_QUANTIZATION="gptq" -export LAI_TENSOR_PARALLEL_SIZE=1 -export LAI_MODEL_SOURCE=".model/" -export LAI_MAX_CONTEXT_LENGTH=32768 -export LAI_STOP_TOKENS='["","<|endoftext|>","<|im_end|>"]' -export LAI_PROMPT_FORMAT_CHAT_SYSTEM="SYSTEM: {}\n" -export LAI_PROMPT_FORMAT_CHAT_ASSISTANT="ASSISTANT: {}\n" -export LAI_PROMPT_FORMAT_CHAT_USER="USER: {}\n" -export LAI_PROMPT_FORMAT_DEFAULTS_TOP_P=1.0 -export LAI_PROMPT_FORMAT_DEFAULTS_TOP_K=0 \ No newline at end of file +LFAI_REPO_ID="TheBloke/SynthIA-7B-v2.0-GPTQ" +LFAI_REVISION="gptq-4bit-32g-actorder_True" + +VLLM_TENSOR_PARALLEL_SIZE=1 +VLLM_TRUST_REMOTE_CODE=True +VLLM_MAX_CONTEXT_LENGTH=32768 +VLLM_ENFORCE_EAGER=False +VLLM_GPU_MEMORY_UTILIZATION=0.90 +VLLM_WORKER_USE_RAY=True +VLLM_ENGINE_USE_RAY=True +VLLM_QUANTIZATION=None +VLLM_LOAD_FORMAT=auto diff --git a/packages/vllm/Dockerfile b/packages/vllm/Dockerfile index 8676f5eda..f53088ead 100755 --- a/packages/vllm/Dockerfile +++ b/packages/vllm/Dockerfile @@ -6,8 +6,9 @@ FROM nvidia/cuda:12.2.2-devel-ubuntu22.04 AS builder # set SDK location # set the pyenv and Python versions ARG SDK_DEST=src/leapfrogai_sdk/build \ - PYTHON_VERSION=3.11.6 \ - PYENV_GIT_TAG=v2.4.8 + PYTHON_VERSION=3.11.9 \ + PYENV_GIT_TAG=v2.4.8\ + COMPONENT_DIRECTORY="packages/vllm" # use root user for deps installation and nonroot user creation USER root @@ -41,7 +42,7 @@ USER nonroot # copy-in SDK from sdk stage and vllm source code from host WORKDIR /home/leapfrogai COPY --from=sdk --chown=nonroot:nonroot /leapfrogai/${SDK_DEST} ./${SDK_DEST} -COPY --chown=nonroot:nonroot packages/vllm packages/vllm +COPY --chown=nonroot:nonroot ${COMPONENT_DIRECTORY} packages/vllm # create virtual environment for light-weight portability and minimal libraries RUN curl https://pyenv.run | bash && \ @@ -54,10 +55,10 @@ RUN curl https://pyenv.run | bash && \ ENV PYENV_ROOT="/home/nonroot/.pyenv" \ PATH="/home/nonroot/.pyenv/bin:$PATH" -# Install Python 3.11.6, set it as global, and create a venv +# Install Python, set it as global, and create a venv RUN . 
~/.bashrc && \ - PYTHON_CONFIGURE_OPTS="--enable-shared" pyenv install 3.11.6 && \ - pyenv global 3.11.6 && \ + PYTHON_CONFIGURE_OPTS="--enable-shared" pyenv install 3.11.9 && \ + pyenv global ${PYTHON_VERSION} && \ pyenv exec python -m venv .venv # set path to venv python @@ -67,26 +68,15 @@ RUN rm -f packages/vllm/build/*.whl && \ python -m pip wheel packages/vllm -w packages/vllm/build --find-links=${SDK_DEST} && \ pip install packages/vllm/build/lfai_vllm*.whl --no-index --find-links=packages/vllm/build/ +################# +# FINAL CONTAINER +################# + FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04 # set SDK location ARG SDK_DEST=src/leapfrogai_sdk/build -# model-specific arguments -ARG ARG HF_HUB_ENABLE_HF_TRANSFER="1" \ - REPO_ID="TheBloke/Synthia-7B-v2.0-GPTQ" \ - REVISION="gptq-4bit-32g-actorder_True" \ - MODEL_SOURCE="/data/.model/" \ - MAX_CONTEXT_LENGTH=32768 \ - STOP_TOKENS='[""]' \ - PROMPT_FORMAT_CHAT_SYSTEM="SYSTEM: {}\n" \ - PROMPT_FORMAT_CHAT_USER="USER: {}\n" \ - PROMPT_FORMAT_CHAT_ASSISTANT="ASSISTANT: {}\n" \ - PROMPT_FORMAT_DEFAULTS_TOP_P=1.0 \ - PROMPT_FORMAT_DEFAULTS_TOP_K=0 \ - TENSOR_PARALLEL_SIZE=1 \ - QUANTIZATION="gptq" - # setup nonroot user and permissions USER root RUN groupadd -g 65532 vglusers && \ @@ -101,24 +91,10 @@ COPY --from=sdk --chown=nonroot:nonroot /leapfrogai/${SDK_DEST} ./${SDK_DEST} COPY --from=builder --chown=nonroot:nonroot /home/leapfrogai/.venv /home/leapfrogai/.venv COPY --from=builder --chown=nonroot:nonroot /home/leapfrogai/packages/vllm/src /home/leapfrogai/packages/vllm/src # copy-in python binaries -COPY --from=builder --chown=nonroot:nonroot /home/nonroot/.pyenv/versions/3.11.6/ /home/nonroot/.pyenv/versions/3.11.6/ - -# load ARG values into env variables for pickup by confz -ENV LAI_HF_HUB_ENABLE_HF_TRANSFER=${HF_HUB_ENABLE_HF_TRANSFER} \ - LAI_REPO_ID=${REPO_ID} \ - LAI_REVISION=${REVISION} \ - LAI_MODEL_SOURCE=${MODEL_SOURCE} \ - LAI_MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH} \ - LAI_STOP_TOKENS=${STOP_TOKENS} \ - LAI_PROMPT_FORMAT_CHAT_SYSTEM=${PROMPT_FORMAT_CHAT_SYSTEM} \ - LAI_PROMPT_FORMAT_CHAT_USER=${PROMPT_FORMAT_CHAT_USER} \ - LAI_PROMPT_FORMAT_CHAT_ASSISTANT=${PROMPT_FORMAT_CHAT_ASSISTANT} \ - LAI_PROMPT_FORMAT_DEFAULTS_TOP_P=${PROMPT_FORMAT_DEFAULTS_TOP_P} \ - LAI_PROMPT_FORMAT_DEFAULTS_TOP_K=${PROMPT_FORMAT_DEFAULTS_TOP_K} \ - LAI_TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE} \ - LAI_QUANTIZATION=${QUANTIZATION} \ - # remove vLLM callback to stats server - VLLM_NO_USAGE_STATS=1 +COPY --from=builder --chown=nonroot:nonroot /home/nonroot/.pyenv/versions/${PYTHON_VERSION}/ /home/nonroot/.pyenv/versions/${PYTHON_VERSION}/ + +# remove vLLM callback to stats server +ENV VLLM_NO_USAGE_STATS=1 ENV PATH="/home/leapfrogai/.venv/bin:$PATH" diff --git a/packages/vllm/Makefile b/packages/vllm/Makefile index 98e8b29db..c764a78f2 100644 --- a/packages/vllm/Makefile +++ b/packages/vllm/Makefile @@ -1,6 +1,27 @@ +ARCH ?= amd64 +LOCAL_VERSION ?= $(shell git rev-parse --short HEAD) +DOCKER_FLAGS := + install: python -m pip install ../../src/leapfrogai_sdk python -m pip install -e ".[dev]" -dev: - python -m leapfrogai_sdk.cli --app-dir=src/ main:Model +download: + @env $$(cat .env | xargs) python src/model_download.py + +dev: download + @env $$(cat .env | xargs) python -m leapfrogai_sdk.cli --app-dir=src/ main:Model + +docker: download + docker build ${DOCKER_FLAGS} \ + --platform=linux/${ARCH} \ + --build-arg LOCAL_VERSION=${LOCAL_VERSION} \ + --build-arg COMPONENT_DIRECTORY="./" \ + -t 
ghcr.io/defenseunicorns/leapfrogai/vllm:${LOCAL_VERSION} \ + -f ./Dockerfile . + + docker run -it --rm \ + --env-file ./.env \ + -v $(PWD)/config.yaml:/home/leapfrogai/config.yaml \ + -v $(PWD)/.model:/home/leapfrogai/.model \ + ghcr.io/defenseunicorns/leapfrogai/vllm:${LOCAL_VERSION} diff --git a/packages/vllm/README.md b/packages/vllm/README.md index a55238cfd..5bc7a052f 100644 --- a/packages/vllm/README.md +++ b/packages/vllm/README.md @@ -16,13 +16,21 @@ See the LeapfrogAI documentation website for [system requirements](https://docs. The default model that comes with this backend in this repository's officially released images is a [4-bit quantization of the Synthia-7b model](https://huggingface.co/TheBloke/SynthIA-7B-v2.0-GPTQ). -You can optionally specify different models or quantization types using the following Docker build arguments: +All of the commands in this sub-section are executed within this `packages/vllm` sub-directory. -- `--build-arg HF_HUB_ENABLE_HF_TRANSFER="1"`: Enable or disable HuggingFace Hub transfer (default: 1) -- `--build-arg REPO_ID="TheBloke/Synthia-7B-v2.0-GPTQ"`: HuggingFace repository ID for the model -- `--build-arg REVISION="gptq-4bit-32g-actorder_True"`: Revision or commit hash for the model -- `--build-arg QUANTIZATION="gptq"`: Quantization type (e.g., gptq, awq, or empty for un-quantized) -- `--build-arg TENSOR_PARALLEL_SIZE="1"`: The number of gpus to spread the tensor processing across +Optionally, you can specify a different model during Zarf creation: + +```bash +uds zarf package create --confirm --set MODEL_REPO_ID=defenseunicorns/Hermes-2-Pro-Mistral-7B-4bit-32g --set MODEL_REVISION=main +``` + +If you decide to use a different model, there will likely be a need to change generation and engine runtime configurations, please see the [Zarf Package Config](./zarf-config.yaml) and the [values override file](./values/upstream-values.yaml) for details on what runtime parameters can be modified. These parameters are model-specific, and can be found in the HuggingFace model cards and/or configuration files (e.g., prompt templates). + +For example, during Zarf deployment, you can override the Zarf Package Config defaults by doing the following: + +```bash +uds zarf package deploy zarf-package-vllm-amd64-dev.tar.zst --confirm --set ENFORCE_EAGER=True +``` ### Deployment @@ -39,11 +47,26 @@ uds zarf package deploy packages/vllm/zarf-package-vllm-*-dev.tar.zst --confirm ### Local Development -To run the vllm backend locally: +In local development the [config.yaml](./config.yaml) and [.env.example](./.env.example) must be modified if the model has changed away from the default. The LeapfrogAI SDK picks up the `config.yaml` automatically, and the `.env` must be sourced into the Python environment. > [!IMPORTANT] > Execute the following commands from this sub-directory +Create a `.env` file based on the [`.env.example`](./.env.example): + +```bash +cp .env.example .env +source .env +``` + +As necessary, modify the existing [`config.yaml`](./config.yaml): + +```bash +vim config.yaml +``` + +To run the vllm backend locally: + ```bash # Install dev and runtime dependencies make install @@ -54,3 +77,19 @@ python src/model_download.py # Start the model backend make dev ``` + +#### Local Docker Container + +To run the Docker container, use the following Makefile commands. `LOCAL_VERSION` must be consistent across the two Make commands. 
+ +In the root of the LeapfrogAI repository: + +```bash +LOCAL_VERSION=dev make sdk-wheel +``` + +In the root of this vLLM sub-directory: + +```bash +LOCAL_VERSION=dev make docker +``` diff --git a/packages/vllm/chart/templates/deployment.yaml b/packages/vllm/chart/templates/deployment.yaml index 7b88cc137..3f8aa0540 100644 --- a/packages/vllm/chart/templates/deployment.yaml +++ b/packages/vllm/chart/templates/deployment.yaml @@ -36,7 +36,7 @@ spec: [ "sh", "-c", - 'while [ ! -f /data/.model/###ZARF_DATA_INJECTION_MARKER### ]; do echo "waiting for zarf data sync" && sleep 1; done; echo "we are done waiting!"', + 'while [ ! -f ###ZARF_CONST_MODEL_PATH###/###ZARF_DATA_INJECTION_MARKER### ]; do echo "waiting for zarf data sync" && sleep 1; done; echo "we are done waiting!"', ] resources: {{- toYaml .Values.modelInjectionContainer.resources | nindent 12 }} @@ -46,6 +46,9 @@ spec: - name: leapfrogai-pv-storage persistentVolumeClaim: claimName: lfai-{{ .Values.nameOverride }}-pv-claim + - name: leapfrogai-sdk-configmap + configMap: + name: "{{ .Values.nameOverride }}-sdk-configmap" securityContext: {{- toYaml .Values.podSecurityContext | nindent 8 }} containers: @@ -58,6 +61,9 @@ spec: env: {{- toYaml . | nindent 12 }} {{- end }} + envFrom: + - configMapRef: + name: "{{ .Values.nameOverride }}-engine-configmap" ports: - name: http containerPort: {{ .Values.service.port }} @@ -67,6 +73,10 @@ spec: volumeMounts: - name: leapfrogai-pv-storage mountPath: "/data" + - name: leapfrogai-sdk-configmap + mountPath: "/home/leapfrogai/config.yaml" + subPath: "config.yaml" + readOnly: true {{- with .Values.nodeSelector }} nodeSelector: {{- toYaml . | nindent 8 }} diff --git a/packages/vllm/chart/templates/leapfrogai-sdk-configmap.yaml b/packages/vllm/chart/templates/leapfrogai-sdk-configmap.yaml new file mode 100644 index 000000000..cdc08be5e --- /dev/null +++ b/packages/vllm/chart/templates/leapfrogai-sdk-configmap.yaml @@ -0,0 +1,37 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Values.nameOverride }}-sdk-configmap" + namespace: {{ .Release.Namespace | default "leapfrogai" }} +data: + config.yaml: | + model: + source: {{ .Values.leapfrogaiConfig.model.source | quote }} + max_context_length: {{ .Values.leapfrogaiConfig.maxContextLength | quote }} + stop_tokens: + {{- $stopTokens := .Values.leapfrogaiConfig.stopTokens }} + {{- range $stopToken := splitList ", " .Values.leapfrogaiConfig.stopTokens }} + - {{ printf "%s" $stopToken }} + {{- end }} + prompt_format: + {{- with .Values.leapfrogaiConfig.promptFormat.chat }} + chat: + {{- if .system }} + system: {{ .system | quote }} + {{- end }} + {{- if .assistant }} + assistant: {{ .assistant | quote }} + {{- end }} + {{- if .user }} + user: {{ .user | quote }} + {{- end }} + {{- if .function }} + function: {{ .function | quote }} + {{- end }} + {{- end }} + defaults: + temperature: {{ .Values.leapfrogaiConfig.defaults.temperature | quote }} + top_p: {{ .Values.leapfrogaiConfig.defaults.topP | quote }} + top_k: {{ .Values.leapfrogaiConfig.defaults.topK | quote }} + repetition_penalty: {{ .Values.leapfrogaiConfig.defaults.repetitionPenalty | quote }} + max_new_tokens: {{ .Values.leapfrogaiConfig.defaults.maxNewTokens | quote }} diff --git a/packages/vllm/chart/templates/vllm-engine-configmap.yaml b/packages/vllm/chart/templates/vllm-engine-configmap.yaml new file mode 100644 index 000000000..5ac82b42c --- /dev/null +++ b/packages/vllm/chart/templates/vllm-engine-configmap.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ 
.Values.nameOverride }}-engine-configmap" + namespace: {{ .Release.Namespace | default "leapfrogai" }} +data: + VLLM_TRUST_REMOTE_CODE: "{{ .Values.vllmConfig.trustRemoteCode }}" + VLLM_TENSOR_PARALLEL_SIZE: "{{ .Values.vllmConfig.tensorParallelSize }}" + VLLM_ENFORCE_EAGER: "{{ .Values.vllmConfig.enforceEager }}" + VLLM_GPU_MEMORY_UTILIZATION: "{{ .Values.vllmConfig.gpuMemoryUtilization }}" + VLLM_WORKER_USE_RAY: "{{ .Values.vllmConfig.workerUseRay }}" + VLLM_ENGINE_USE_RAY: "{{ .Values.vllmConfig.engineUseRay }}" + VLLM_QUANTIZATION: "{{ .Values.vllmConfig.quantization }}" + VLLM_LOAD_FORMAT: "{{ .Values.vllmConfig.loadFormat }}" diff --git a/packages/vllm/chart/values.yaml b/packages/vllm/chart/values.yaml index 0f7fe9911..0209a8b34 100644 --- a/packages/vllm/chart/values.yaml +++ b/packages/vllm/chart/values.yaml @@ -13,6 +13,33 @@ image: nameOverride: "vllm" fullnameOverride: "" +leapfrogaiConfig: + model: + source: "/data/.model/" + maxContextLength: "32768" + stopTokens: ", <|im_end|>, <|endoftext|>" + promptFormat: + chat: + system: "SYSTEM: {}\n" + assistant: "ASSISTANT: {}\n" + user: "USER: {}\n" + defaults: + temperature: "0.1" + topP: "1.0" + topK: "0" + repetitionPenalty: "1.0" + maxNewTokens: "8192" + +vllmConfig: + trustRemoteCode: "True" + tensorParallelSize: "1" + enforceEager: "False" + gpuMemoryUtilization: "0.90" + workerUseRay: "True" + engineUseRay: "True" + quantization: "None" + loadFormat: "auto" + env: - name: LFAI_LOG_LEVEL value: "INFO" @@ -41,7 +68,7 @@ resources: limits: cpu: 0 memory: 0 - nvidia.com/gpu: 0 + nvidia.com/gpu: 1 requests: cpu: 0 memory: 0 diff --git a/packages/vllm/config.yaml b/packages/vllm/config.yaml new file mode 100644 index 000000000..22210a74b --- /dev/null +++ b/packages/vllm/config.yaml @@ -0,0 +1,17 @@ +model: + source: ".model/" +max_context_length: 32768 +stop_tokens: + - "<|im_end|>" + - "<|endoftext|>" + - "" +prompt_format: + chat: + system: "SYSTEM: {}\n" + assistant: "ASSISTANT: {}\n" + user: "USER: {}\n" +defaults: + top_p: 1.0 + top_k: 0 + repetition_penalty: 1.0 + max_new_tokens: 8192 diff --git a/packages/vllm/pyproject.toml b/packages/vllm/pyproject.toml index 4d7955708..24b1363e6 100644 --- a/packages/vllm/pyproject.toml +++ b/packages/vllm/pyproject.toml @@ -8,7 +8,7 @@ version = "0.13.1" dependencies = [ "pydantic == 2.8.2", - "vllm == 0.4.2", + "vllm == 0.4.3", "python-dotenv == 1.0.1", "aiostream ==0.6.2", "leapfrogai-sdk", diff --git a/packages/vllm/src/config.py b/packages/vllm/src/config.py index debca4ba3..c13af5521 100644 --- a/packages/vllm/src/config.py +++ b/packages/vllm/src/config.py @@ -5,10 +5,6 @@ class ConfigOptions(BaseConfig): - quantization: Literal[None, "awq", "gptq", "squeezellm"] = Field( - default=None, - description="Type of quantization, for un-quantized models omit this field", - ) tensor_parallel_size: int = Field( default=1, title="GPU Utilization Count", @@ -16,39 +12,105 @@ class ConfigOptions(BaseConfig): "This must be divisible to the number of attention heads in the model", examples=[1, 2, 3], ) + quantization: Literal[ + "aqlm", + "bitsandbytes", + "awq", + "deepspeedfp", + "fp8", + "marlin", + "gptq_marlin_24", + "gptq_marlin", + "gptq", + "squeezellm", + "sparseml", + "None", + "", + ] = Field( + title="quantization", + description="Quantization type of the model" + "Force GPTQ instead of GPTQ_Marlin by explicitly providing `gptq` as value.", + examples=["awq", "fp8", "gptq_marlin", "gptq", "squeezellm", "None"], + ) + load_format: Literal["auto", "safetensors", "npz", "pt", 
"bitsandbytes"] = Field( + title="quantization", + description="Load format for the type model and files", + examples=["auto", "safetensors", "npz", "pt", "bitsandbytes"], + ) + enforce_eager: bool = Field( + title="Enable Eager Mode", + description="Enable eager mode to start token generation immediately after prompt processing." + "Potentially reduces initial latency at the cost of slightly higher memory usage." + "Should be set to False in production environments with higher GPU memory.", + examples=[True, False], + ) + gpu_memory_utilization: float = Field( + title="GPU Memory Limit", + description="Maximum amount of GPU vRAM allocated to the vLLM engine and worker(s)", + examples=[0.50, 0.80, 0.90], + ) + engine_use_ray: bool = Field( + title="Use Ray for Engine", + description="If True, uses Ray for managing the execution engine. Allows for distributed inferencing in multi-node situations.", + examples=[True, False], + ) + worker_use_ray: bool = Field( + title="Use Ray for Worker", + description="If True, uses Ray for distributed worker management. Allows for distributed inferencing in multi-node situations.", + examples=[True, False], + ) + trust_remote_code: bool = Field( + title="Trust Downloaded Model Code", + description="Whether to trust inferencing code downloaded as part of the model download." + "Please review the Python code in the .model/ directory before trusting custom model code.", + examples=[True, False], + ) class DownloadOptions(BaseConfig): - hf_hub_enable_hf_transfer: Literal["0", "1"] = Field( - description="Option (0 - Disable, 1 - Enable) for faster transfers, tradeoff stability for faster speeds" - ) repo_id: str = Field( - description="HuggingFace repo id", + description="The HuggingFace git repository ID", examples=[ - "TheBloke/Synthia-7B-v2.0-GPTQ", - "migtissera/Synthia-MoE-v3-Mixtral-8x7B", - "microsoft/phi-2", + "defenseunicorns/Hermes-2-Pro-Mistral-7B-4bit-32g", + "justinthelaw/Phi-3-mini-128k-instruct-4bit-128g", ], ) revision: str = Field( - description="The model branch to use", + description="The HuggingFace repository git branch to use", examples=["main", "gptq-4bit-64g-actorder_True"], ) +# vLLM specific runtime configuration options class AppConfig(BaseConfig): backend_options: ConfigOptions + CONFIG_SOURCES = [ + EnvSource( + allow_all=True, + prefix="VLLM_", + remap={ + "tensor_parallel_size": "backend_options.tensor_parallel_size", + "trust_remote_code": "backend_options.trust_remote_code", + "enforce_eager": "backend_options.enforce_eager", + "quantization": "backend_options.quantization", + "gpu_memory_utilization": "backend_options.gpu_memory_utilization", + "worker_use_ray": "backend_options.worker_use_ray", + "engine_use_ray": "backend_options.engine_use_ray", + "load_format": "backend_options.load_format", + }, + ) + ] + + +class DownloadConfig(BaseConfig): download_options: Optional[DownloadOptions] CONFIG_SOURCES = [ EnvSource( allow_all=True, - prefix="LAI_", + prefix="LFAI_", remap={ - "hf_hub_enable_hf_transfer": "download_options.hf_hub_enable_hf_transfer", "repo_id": "download_options.repo_id", "revision": "download_options.revision", - "quantization": "backend_options.quantization", - "tensor_parallel_size": "backend_options.tensor_parallel_size", }, ) ] diff --git a/packages/vllm/src/main.py b/packages/vllm/src/main.py index 6a530e4f0..67d36d178 100644 --- a/packages/vllm/src/main.py +++ b/packages/vllm/src/main.py @@ -1,15 +1,12 @@ import asyncio -import json import logging import os import queue import random -import sys 
import threading import time from typing import Any, Dict, AsyncGenerator -from confz import EnvSource from dotenv import load_dotenv from vllm import SamplingParams from vllm.engine.arg_utils import AsyncEngineArgs @@ -18,15 +15,8 @@ from vllm.utils import random_uuid from config import AppConfig -from leapfrogai_sdk import ( - BackendConfig, - ChatCompletionRequest, - CompletionRequest, -) -from leapfrogai_sdk.llm import ( - GenerationConfig, - LLM, -) +from leapfrogai_sdk import BackendConfig +from leapfrogai_sdk.llm import GenerationConfig, LLM load_dotenv() @@ -84,60 +74,6 @@ def remove_iterator(self, async_iterable): pass # If the iterable is not found, ignore the error -def get_backend_configs(): - # Manually load env var as ConfZ does not handle complex types (list) - stop_tokens: str | None = os.getenv("LAI_STOP_TOKENS") - if stop_tokens: - processed_stop_tokens = json.loads(stop_tokens) - else: - processed_stop_tokens = [] - del os.environ["LAI_STOP_TOKENS"] - - env_source = EnvSource( - allow_all=True, - prefix="LAI_", - remap={ - "model_source": "model.source", - "max_context_length": "max_context_length", - "stop_tokens": "stop_tokens", - "prompt_format_chat_system": "prompt_format.chat.system", - "prompt_format_chat_assistant": "prompt_format.chat.assistant", - "prompt_format_chat_user": "prompt_format.chat.user", - "prompt_format_defaults_top_p": "prompt_format.defaults.top_p", - "prompt_format_defaults_top_k": "prompt_format.defaults.top_k", - }, - ) - - BackendConfig.CONFIG_SOURCES = env_source - # Initialize an immutable config from env variables without stop_tokens list - backend_configs: BackendConfig = BackendConfig() - # Updates "processed_stop_tokens" without triggering Pydantic validation errors - backend_configs.model_copy(update={"stop_tokens": processed_stop_tokens}) - - return backend_configs - - -def get_config_from_request(request: ChatCompletionRequest | CompletionRequest): - return GenerationConfig( - max_new_tokens=request.max_new_tokens, - temperature=request.temperature, - top_k=request.top_k, - top_p=request.top_p, - do_sample=request.do_sample, - n=request.n, - stop=list(request.stop), - repetition_penalty=request.repetition_penalty, - presence_penalty=request.presence_penalty, - best_of=str(request.best_of), - logit_bias=request.logit_bias, - return_full_text=request.return_full_text, - truncate=request.truncate, - typical_p=request.typical_p, - watermark=request.watermark, - seed=request.seed, - ) - - @LLM class Model: """Implements an LLM model with concurrent output generation and management.""" @@ -152,19 +88,26 @@ def __init__(self): _thread = threading.Thread(target=asyncio.run, args=(self.iterate_outputs(),)) _thread.start() - self.backend_config = get_backend_configs() - self.model = self.backend_config.model.source + quantization = ( + None + if AppConfig().backend_options.quantization in ["", "None"] + else AppConfig().backend_options.quantization + ) + self.engine_args = AsyncEngineArgs( - engine_use_ray=True, - model=self.model, - trust_remote_code=False, - quantization=AppConfig().backend_options.quantization, - max_seq_len_to_capture=self.backend_config.max_context_length, - max_model_len=self.backend_config.max_context_length, - dtype="auto", - worker_use_ray=True, - gpu_memory_utilization=0.90, + # Taken from the LFAI SDK general LLM configuration + model=BackendConfig().model.source, + max_seq_len_to_capture=BackendConfig().max_context_length, + max_model_len=BackendConfig().max_context_length, + # Taken from the vLLM-specific 
configuration + enforce_eager=AppConfig().backend_options.enforce_eager, + quantization=quantization, + load_format=AppConfig().backend_options.load_format, tensor_parallel_size=AppConfig().backend_options.tensor_parallel_size, + engine_use_ray=AppConfig().backend_options.engine_use_ray, + worker_use_ray=AppConfig().backend_options.worker_use_ray, + gpu_memory_utilization=AppConfig().backend_options.gpu_memory_utilization, + trust_remote_code=AppConfig().backend_options.trust_remote_code, ) self.engine = AsyncLLMEngine.from_engine_args(self.engine_args) print(self.engine_args) @@ -228,18 +171,39 @@ async def create_response( """Initiate a response generation for the given prompt and configuration, adding the result to the iterator pool.""" - sampling_params = SamplingParams( - temperature=config.temperature, - # Clamp top_p value to prevent float errors - top_p=clamp(config.top_p, 0.0 + sys.float_info.epsilon, 1.0), - # Restrict top_k to valid values, -1 disables top_k - top_k=config.top_k if config.top_k >= 1 else -1, - stop=self.backend_config.stop_tokens, - max_tokens=config.max_new_tokens, - skip_special_tokens=False, - ) + # Collect LeapfrogAI SDK-defined parameters not aligned with vLLM SamplingParams + params = { + "max_tokens": getattr(config, "max_new_tokens"), + } + + # Collect LeapfrogAI SDK-defined parameters directly aligned with vLLM SamplingParams + aligned_params = [ + "temperature", + "top_p", + "top_k", + "stop", + "n", + "repetition_penalty", + "presence_penalty", + "best_of", + "logit_bias", + "return_full_text", + "truncate", + "typical_p", + "seed", + ] + + # Add only the parameters that exist in the request + # vLLM will provide defaults for the rest, if not specified + for param in aligned_params: + if param in config: + params[param] = config[param] + + # Pass the collected params to vLLM SamplingParams + sampling_params = SamplingParams(**params) + logger.info(f"Begin generation for request {request_id}") - logger.debug(f"{request_id} sampling_paramms: {sampling_params}") + logger.debug(f"{request_id} sampling_params: {sampling_params}") # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. 
@@ -284,8 +248,12 @@ async def generate( request_id ): result = "" - if not self.is_queue_empty(request_id): - result = self.delta_queue_by_id.get(request_id).get() + + # Ensure that the queue is not None and contains items before calling .get() + cur_queue = self.delta_queue_by_id.get(request_id) + if cur_queue is not None and not cur_queue.empty(): + result = cur_queue.get() + yield result logger.info(f"Finished request {request_id}") diff --git a/packages/vllm/src/model_download.py b/packages/vllm/src/model_download.py index 29f88942c..b87b6a61e 100644 --- a/packages/vllm/src/model_download.py +++ b/packages/vllm/src/model_download.py @@ -1,18 +1,17 @@ import os from huggingface_hub import snapshot_download -from config import AppConfig +from config import DownloadConfig -REPO_ID = AppConfig().download_options.repo_id -REVISION = AppConfig().download_options.revision -os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = ( - AppConfig().download_options.hf_hub_enable_hf_transfer -) +REPO_ID = DownloadConfig().download_options.repo_id +REVISION = DownloadConfig().download_options.revision + +# enable hf_transfer to max-out model download bandwidth +os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" print(f"Downloading model from {REPO_ID} at revision {REVISION}...") snapshot_download( repo_id=REPO_ID, local_dir=".model", - local_dir_use_symlinks=False, revision=REVISION, ) diff --git a/packages/vllm/values/upstream-values.yaml b/packages/vllm/values/upstream-values.yaml index 0fe581bdd..e74ebec4a 100644 --- a/packages/vllm/values/upstream-values.yaml +++ b/packages/vllm/values/upstream-values.yaml @@ -2,12 +2,55 @@ image: repository: "ghcr.io/defenseunicorns/leapfrogai/vllm" tag: "###ZARF_CONST_IMAGE_VERSION###" +nameOverride: "###ZARF_CONST_NAME_OVERRIDE###" + +leapfrogaiConfig: + model: + source: "###ZARF_CONST_MODEL_PATH###" + maxContextLength: "###ZARF_VAR_MAX_CONTEXT_LENGTH###" + stopTokens: "###ZARF_VAR_STOP_TOKENS###" + promptFormat: + chat: + system: "###ZARF_VAR_PROMPT_FORMAT_CHAT_SYSTEM###" + assistant: "###ZARF_VAR_PROMPT_FORMAT_CHAT_ASSISTANT###" + user: "###ZARF_VAR_PROMPT_FORMAT_CHAT_USER###" + defaults: + temperature: "###ZARF_VAR_TEMPERATURE###" + topP: "###ZARF_VAR_TOP_P###" + topK: "###ZARF_VAR_TOP_K###" + repetitionPenalty: "###ZARF_VAR_REPETITION_PENALTY###" + maxNewTokens: "###ZARF_VAR_MAX_NEW_TOKENS###" + + +vllmConfig: + trustRemoteCode: "###ZARF_VAR_TRUST_REMOTE_CODE###" + tensorParallelSize: "###ZARF_VAR_TENSOR_PARALLEL_SIZE###" + enforceEager: "###ZARF_VAR_ENFORCE_EAGER###" + gpuMemoryUtilization: "###ZARF_VAR_GPU_MEMORY_UTILIZATION###" + workerUseRay: "###ZARF_VAR_WORKER_USE_RAY###" + engineUseRay: "###ZARF_VAR_ENGINE_USE_RAY###" + quantization: "###ZARF_VAR_QUANTIZATION###" + loadFormat: "###ZARF_VAR_LOAD_FORMAT###" + +env: + - name: LFAI_LOG_LEVEL + value: "INFO" + gpu: runtimeClassName: "###ZARF_VAR_GPU_RUNTIME###" resources: + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 
limits: + cpu: 0 + memory: 0 nvidia.com/gpu: "###ZARF_VAR_GPU_LIMIT###" + requests: + cpu: 0 + memory: 0 persistence: size: "###ZARF_VAR_PVC_SIZE###" diff --git a/packages/vllm/zarf-config.yaml b/packages/vllm/zarf-config.yaml new file mode 100644 index 000000000..5f032eecb --- /dev/null +++ b/packages/vllm/zarf-config.yaml @@ -0,0 +1,39 @@ +package: + create: + set: + # x-release-please-start-version + image_version: "0.13.0" + # x-release-please-end + + model_repo_id: "TheBloke/Synthia-7B-v2.0-GPTQ" + model_revision: "gptq-4bit-32g-actorder_True" + model_path: "/data/.model/" + name_override: "vllm" + deploy: + set: + # vLLM runtime configuration (usually influenced by .env in local development) + trust_remote_code: "True" + tensor_parallel_size: "1" + enforce_eager: "False" + gpu_memory_utilization: "0.90" + worker_use_ray: "True" + engine_use_ray: "True" + quantization: "None" + load_format: "auto" + # LeapfrogAI SDK runtime configuration (usually influenced by config.yaml in development) + max_context_length: "32768" + stop_tokens: ", <|im_end|>, <|endoftext|>" + prompt_format_chat_system: "SYSTEM: {}\n" + prompt_format_chat_user: "USER: {}\n" + prompt_format_chat_assistant: "ASSISTANT: {}\n" + temperature: "0.1" + top_p: "1.0" + top_k: "0" + repetition_penalty: "1.0" + max_new_tokens: "8192" + # Pod deployment configuration + gpu_limit: "1" + gpu_runtime: "nvidia" + pvc_size: "15Gi" + pvc_access_mode: "ReadWriteOnce" + pvc_storage_class: "local-path" diff --git a/packages/vllm/zarf.yaml b/packages/vllm/zarf.yaml index ed88c2f18..5e1733d17 100644 --- a/packages/vllm/zarf.yaml +++ b/packages/vllm/zarf.yaml @@ -9,27 +9,86 @@ metadata: constants: - name: IMAGE_VERSION value: "###ZARF_PKG_TMPL_IMAGE_VERSION###" + - name: MODEL_REPO_ID + description: "The HuggingFace repository ID" + value: "###ZARF_PKG_TMPL_MODEL_REPO_ID###" + - name: MODEL_REVISION + description: "The HuggingFace git branch or commit hash" + value: "###ZARF_PKG_TMPL_MODEL_REVISION###" + - name: MODEL_PATH + description: "Defines the location of the Zarf Injected model files in the vLLM container" + value: "###ZARF_PKG_TMPL_MODEL_PATH###" + - name: NAME_OVERRIDE + description: "Provide an override for the name of the deployment (e.g., the model name)" + value: "###ZARF_PKG_TMPL_NAME_OVERRIDE###" variables: + # vLLM runtime configuration (usually influenced by .env in local development) + - name: TRUST_REMOTE_CODE + description: "If True, allows the execution of code within the model files directory" + pattern: "^(True|False)$" + - name: TENSOR_PARALLEL_SIZE + description: "The number of tensor parallelism splits, typically used for model parallelism across GPUs" + pattern: "^[1-9][0-9]*$" + - name: ENFORCE_EAGER + description: "If set to True, enforces eager execution mode instead of lazy execution, impacting performance" + pattern: "^(True|False)$" + - name: GPU_MEMORY_UTILIZATION + description: "The fraction of GPU memory to be utilized, expressed as a decimal value between 0.01 and 0.99" + pattern: ^0\.(0[1-9]|[1-9][0-9])$ + - name: WORKER_USE_RAY + description: "If True, uses Ray for distributed worker management" + pattern: "^(True|False)$" + - name: ENGINE_USE_RAY + description: "If True, uses Ray for managing the execution engine" + pattern: "^(True|False)$" + - name: QUANTIZATION + description: "If None, allows vLLM to automatically detect via model files and configuration" + - name: LOAD_FORMAT + description: "If auto, allows vLLM to automatically detect via model files and configuration" + # LeapfrogAI SDK 
runtime configuration (usually influenced by config.yaml in development) + - name: MAX_CONTEXT_LENGTH + description: "The maximum number of tokens the model can process in a single input before the inferencing engine's overflow strategy is used" + pattern: "^[1-9][0-9]*$" + - name: STOP_TOKENS + description: "A set of special tokens that signal the model to stop producing further output, delimited using a comma and space" + pattern: ^(<[^,]+>\s*,\s*)*<[^,]+>\s*$ + - name: PROMPT_FORMAT_CHAT_SYSTEM + description: "Prompt template format for the LeapfrogAI SDK to consume and wrap" + - name: PROMPT_FORMAT_CHAT_USER + description: "Prompt template format for the LeapfrogAI SDK to consume and wrap" + - name: PROMPT_FORMAT_CHAT_ASSISTANT + description: "Prompt template format for the LeapfrogAI SDK to consume and wrap" + - name: TEMPERATURE + description: "Controls the randomness of the model's output" + pattern: ^(0(\.\d+)?|1(\.0+)?)$ + - name: TOP_P + description: "The cumulative probability threshold for token sampling, where 1.0 represents no restriction" + pattern: ^(0(\.\d+)?|1(\.0+)?)$ + - name: TOP_K + description: "The number of top-K tokens to consider during sampling, where 0 disables top-K sampling" + pattern: ^\d+$ + - name: REPETITION_PENALTY + description: "The penalty value for repetition in generation" + pattern: ^(0(\.\d+)?|1(\.0+)?)$ + - name: MAX_NEW_TOKENS + description: "Maximum new tokens to generate" + pattern: ^\d+$ + # Pod deployment configuration - name: GPU_LIMIT - description: The GPU limit for the model inferencing. Must be 1 or more. - default: "1" + description: "The GPU limit for the model inferencing. Must be 1 or more." pattern: "^[1-9][0-9]*$" - name: GPU_RUNTIME - description: The GPU runtime name for the model inferencing. - default: "nvidia" + description: "The GPU runtime name for the model inferencing." pattern: "^(nvidia)?$" - name: PVC_SIZE - description: Size of the PVC used for model storage. - default: "15Gi" + description: "Size of the PVC used for model storage." pattern: "^[0-9]+[a-zA-Z]+$" - name: PVC_ACCESS_MODE - description: Access mode of the PVC used for model storage. - default: "ReadWriteOnce" + description: "Access mode of the PVC used for model storage." pattern: "^(ReadWriteOnce|ReadOnlyMany|ReadWriteMany)$" - name: PVC_STORAGE_CLASS - description: Storage class of the PVC used for model storage. - default: "local-path" + description: "Storage class of the PVC used for model storage." 
components: - name: vllm-model @@ -37,33 +96,33 @@ components: only: flavor: upstream charts: - - name: vllm-model + - name: "###ZARF_PKG_TMPL_NAME_OVERRIDE###-model" namespace: leapfrogai localPath: chart - releaseName: vllm-model + releaseName: "###ZARF_PKG_TMPL_NAME_OVERRIDE###-model" # x-release-please-start-version version: 0.13.1 # x-release-please-end valuesFiles: - "values/upstream-values.yaml" images: - - ghcr.io/defenseunicorns/leapfrogai/vllm:###ZARF_PKG_TMPL_IMAGE_VERSION### - - cgr.dev/chainguard/bash:latest + - "ghcr.io/defenseunicorns/leapfrogai/vllm:###ZARF_PKG_TMPL_IMAGE_VERSION###" + - "cgr.dev/chainguard/bash:latest" dataInjections: - - source: .model/ + # location where locally downloaded model files are located + - source: ".model/" target: - namespace: leapfrogai - selector: app=lfai-vllm - container: data-loader - path: /data/.model + namespace: "leapfrogai" + selector: "app=lfai-###ZARF_PKG_TMPL_NAME_OVERRIDE###" + container: "data-loader" + # location in the container for injection of the model files + path: "###ZARF_PKG_TMPL_MODEL_PATH###" compress: true actions: onCreate: before: # NOTE: This assumes python is installed and in $PATH and 'huggingface_hub[cli,hf_transfer]' has been installed - - cmd: python src/model_download.py + - cmd: "python src/model_download.py" env: - - LAI_REPO_ID=TheBloke/Synthia-7B-v2.0-GPTQ - - LAI_REVISION=gptq-4bit-32g-actorder_True - - LAI_QUANTIZATION=gptq - - LAI_HF_HUB_ENABLE_HF_TRANSFER=1 + - LFAI_REPO_ID=###ZARF_PKG_TMPL_MODEL_REPO_ID### + - LFAI_REVISION=###ZARF_PKG_TMPL_MODEL_REVISION### diff --git a/packages/whisper/Dockerfile b/packages/whisper/Dockerfile index b3bed054a..a5513e9fa 100644 --- a/packages/whisper/Dockerfile +++ b/packages/whisper/Dockerfile @@ -37,8 +37,8 @@ COPY --from=builder /leapfrogai/.venv/ /leapfrogai/.venv/ # set the path to the cuda 11.8 dependencies ENV LD_LIBRARY_PATH \ - /leapfrogai/.venv/lib64/python3.11/site-packages/nvidia/cublas/lib:\ - /leapfrogai/.venv/lib64/python3.11/site-packages/nvidia/cudnn/lib +/leapfrogai/.venv/lib64/python3.11/site-packages/nvidia/cublas/lib:\ +/leapfrogai/.venv/lib64/python3.11/site-packages/nvidia/cudnn/lib COPY packages/whisper/main.py . diff --git a/src/leapfrogai_api/README.md b/src/leapfrogai_api/README.md index eec4dd0c6..214c986a9 100644 --- a/src/leapfrogai_api/README.md +++ b/src/leapfrogai_api/README.md @@ -56,3 +56,72 @@ See the ["Access" section of the DEVELOPMENT.md](../../docs/DEVELOPMENT.md#acces ### Tests See the [tests directory documentation](../../tests/README.md) for more details. + +### Reranking Configuration + +The LeapfrogAI API includes a Retrieval Augmented Generation (RAG) pipeline for enhanced question answering. This section details how to configure its reranking options. All RAG configurations are managed through the `/leapfrogai/v1/rag/configure` API endpoint. + +#### 1. Enabling/Disabling Reranking + +Reranking improves the accuracy and relevance of RAG responses. You can enable or disable it using the `enable_reranking` parameter: + +* **Enable Reranking:** Send a PATCH request to `/leapfrogai/v1/rag/configure` with the following JSON payload: + +```json +{ + "enable_reranking": true +} +``` + +* **Disable Reranking:** Send a PATCH request with: + +```json +{ + "enable_reranking": false +} +``` + +#### 2. Selecting a Reranking Model + +Multiple reranking models are supported, each offering different performance characteristics. Choose your preferred model using the `ranking_model` parameter. 
Ensure you've installed any necessary Python dependencies for your chosen model (see the [rerankers library documentation](https://github.com/AnswerDotAI/rerankers) on dependencies). + +* **Supported Models:** The system supports several models, including (but not limited to) `flashrank`, `rankllm`, `cross-encoder`, and `colbert`. Refer to the [rerankers library documentation](https://github.com/AnswerDotAI/rerankers) for a complete list and details on their capabilities. + +* **Model Selection:** Use a PATCH request to `/leapfrogai/v1/rag/configure` with the desired model: + +```json +{ + "enable_reranking": true, // Reranking must be enabled + "ranking_model": "rankllm" // Or another supported model +} +``` + +#### 3. Adjusting the Number of Results Before Reranking (`rag_top_k_when_reranking`) + +This parameter sets the number of top results retrieved from the vector database *before* the reranking process begins. A higher value increases the diversity of candidates considered for reranking but also increases processing time. A lower value can lead to missing relevant results if not carefully chosen. This setting is only relevant when reranking is enabled. + +* **Configuration:** Use a PATCH request to `/leapfrogai/v1/rag/configure` to set this value: + +```json +{ + "enable_reranking": true, + "ranking_model": "flashrank", + "rag_top_k_when_reranking": 150 // Adjust this value as needed +} +``` + +#### 4. Retrieving the Current RAG Configuration + +To check the current RAG configuration (including reranking status, model, and `rag_top_k_when_reranking`), send a GET request to `/leapfrogai/v1/rag/configure`. The response will be a JSON object containing all the current settings. + +#### 5. Example Configuration Flow + +1. **Initial Setup:** Start with reranking enabled using the default `flashrank` model and a `rag_top_k_when_reranking` value of 100. + +2. **Experiment with Models:** Test different reranking models (`rankllm`, `colbert`, etc.) by changing the `ranking_model` parameter and observing the impact on response quality. Adjust `rag_top_k_when_reranking` as needed to find the optimal balance between diversity and performance. + +3. **Fine-tuning:** Once you identify a suitable model, fine-tune the `rag_top_k_when_reranking` parameter for optimal performance. Monitor response times and quality to determine the best setting. + +4. **Disabling Reranking:** If needed, disable reranking by setting `"enable_reranking": false`. + +Remember to always consult the [rerankers library documentation](https://github.com/AnswerDotAI/rerankers) for information on supported models and their specific requirements. The API documentation provides further details on request formats and potential error responses. 
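+
+#### 6. Example Requests
+
+As a reference only, here is a minimal `curl` sketch of the configuration endpoints described above. It assumes the API is reachable at `https://leapfrogai-api.uds.dev` and that `$LFAI_API_KEY` holds a valid bearer token for your deployment; substitute the host and credentials for your environment.
+
+```bash
+# Update the RAG configuration at runtime (hypothetical host and token shown)
+curl -X PATCH "https://leapfrogai-api.uds.dev/leapfrogai/v1/rag/configure" \
+  -H "Authorization: Bearer $LFAI_API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{"enable_reranking": true, "ranking_model": "flashrank", "rag_top_k_when_reranking": 100}'
+
+# Retrieve the current RAG configuration
+curl -X GET "https://leapfrogai-api.uds.dev/leapfrogai/v1/rag/configure" \
+  -H "Authorization: Bearer $LFAI_API_KEY"
+```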
diff --git a/src/leapfrogai_api/backend/grpc_client.py b/src/leapfrogai_api/backend/grpc_client.py index f9082fdc2..9d18d2951 100644 --- a/src/leapfrogai_api/backend/grpc_client.py +++ b/src/leapfrogai_api/backend/grpc_client.py @@ -63,7 +63,7 @@ async def completion(model: Model, request: lfai.CompletionRequest): CompletionChoice( index=0, text=response.choices[0].text, - finish_reason=finish_reason_enum.to_string(), + finish_reason=finish_reason_enum.to_finish_reason(), logprobs=None, ) ], @@ -122,7 +122,7 @@ async def chat_completion(model: Model, request: lfai.ChatCompletionRequest): ).lower(), content=response.choices[0].chat_item.content, ), - finish_reason=finish_reason_enum.to_string(), + finish_reason=finish_reason_enum.to_finish_reason(), ) ], usage=Usage( diff --git a/src/leapfrogai_api/backend/helpers.py b/src/leapfrogai_api/backend/helpers.py index 65a2fd0b5..005111601 100644 --- a/src/leapfrogai_api/backend/helpers.py +++ b/src/leapfrogai_api/backend/helpers.py @@ -39,7 +39,7 @@ async def recv_completion( index=0, text=c.choices[0].text, logprobs=None, - finish_reason=finish_reason_enum.to_string(), + finish_reason=finish_reason_enum.to_finish_reason(), ) ], usage=Usage( @@ -77,7 +77,7 @@ async def recv_chat( delta=ChatDelta( role="assistant", content=c.choices[0].chat_item.content ), - finish_reason=finish_reason_enum.to_string(), + finish_reason=finish_reason_enum.to_finish_reason(), ) ], usage=Usage( diff --git a/src/leapfrogai_api/backend/rag/query.py b/src/leapfrogai_api/backend/rag/query.py index e5e0decce..bd0ae9bf6 100644 --- a/src/leapfrogai_api/backend/rag/query.py +++ b/src/leapfrogai_api/backend/rag/query.py @@ -1,11 +1,15 @@ """Service for querying the RAG model.""" +from rerankers.results import RankedResults from supabase import AClient as AsyncClient from langchain_core.embeddings import Embeddings from leapfrogai_api.backend.rag.leapfrogai_embeddings import LeapfrogAIEmbeddings from leapfrogai_api.data.crud_vector_content import CRUDVectorContent -from leapfrogai_api.typedef.vectorstores.search_types import SearchResponse +from leapfrogai_api.typedef.rag.rag_types import ConfigurationSingleton +from leapfrogai_api.typedef.vectorstores.search_types import SearchResponse, SearchItem from leapfrogai_api.backend.constants import TOP_K +from leapfrogai_api.utils.logging_tools import logger +from rerankers import Reranker # Allows for overwriting type of embeddings that will be instantiated embeddings_type: type[Embeddings] | type[LeapfrogAIEmbeddings] | None = ( @@ -22,7 +26,10 @@ def __init__(self, db: AsyncClient) -> None: self.embeddings = embeddings_type() async def query_rag( - self, query: str, vector_store_id: str, k: int = TOP_K + self, + query: str, + vector_store_id: str, + k: int = TOP_K, ) -> SearchResponse: """ Query the Vector Store. @@ -36,11 +43,70 @@ async def query_rag( SearchResponse: The search response from the vector store. """ + logger.debug("Beginning RAG query...") + # 1. Embed query vector = await self.embeddings.aembed_query(query) # 2. Perform similarity search + _k: int = k + if ConfigurationSingleton.get_instance().enable_reranking: + """Use the user specified top-k value unless reranking. + When reranking, use the reranking top-k value to get the initial results. 
+ Then filter the list down later to just the k that the user has requested after reranking.""" + _k = ConfigurationSingleton.get_instance().rag_top_k_when_reranking + crud_vector_content = CRUDVectorContent(db=self.db) - return await crud_vector_content.similarity_search( - query=vector, vector_store_id=vector_store_id, k=k + results = await crud_vector_content.similarity_search( + query=vector, vector_store_id=vector_store_id, k=_k ) + + # 3. Rerank results + if ( + ConfigurationSingleton.get_instance().enable_reranking + and len(results.data) > 0 + ): + ranker = Reranker(ConfigurationSingleton.get_instance().ranking_model) + ranked_results: RankedResults = ranker.rank( + query=query, + docs=[result.content for result in results.data], + doc_ids=[result.id for result in results.data], + ) + results = rerank_search_response(results, ranked_results) + # Narrow down the results to the top-k value specified by the user + results.data = results.data[0:k] + + logger.debug("Ending RAG query...") + + return results + + +def rerank_search_response( + original_response: SearchResponse, ranked_results: RankedResults +) -> SearchResponse: + """ + Reorder the SearchResponse based on reranked results. + + Args: + original_response (SearchResponse): The original search response. + ranked_results (List[str]): List of ranked content strings. + + Returns: + SearchResponse: A new SearchResponse with reordered items. + """ + # Create a mapping of id to original SearchItem + content_to_item = {item.id: item for item in original_response.data} + + # Create new SearchItems based on reranked results + ranked_items = [] + for content in ranked_results.results: + if content.document.doc_id in content_to_item: + item: SearchItem = content_to_item[content.document.doc_id] + item.rank = content.rank + item.score = content.score + ranked_items.append(item) + + ranked_response = SearchResponse(data=ranked_items) + + # Create a new SearchResponse with reranked items + return ranked_response diff --git a/src/leapfrogai_api/main.py b/src/leapfrogai_api/main.py index 85822f7f3..108ccd51e 100644 --- a/src/leapfrogai_api/main.py +++ b/src/leapfrogai_api/main.py @@ -8,12 +8,13 @@ from fastapi import FastAPI from fastapi.exception_handlers import request_validation_exception_handler from fastapi.exceptions import RequestValidationError - +from fastapi.responses import RedirectResponse from leapfrogai_api.routers.base import router as base_router from leapfrogai_api.routers.leapfrogai import auth from leapfrogai_api.routers.leapfrogai import models as lfai_models from leapfrogai_api.routers.leapfrogai import vector_stores as lfai_vector_stores from leapfrogai_api.routers.leapfrogai import count as lfai_token_count +from leapfrogai_api.routers.leapfrogai import rag as lfai_rag from leapfrogai_api.routers.openai import ( assistants, audio, @@ -29,6 +30,7 @@ vector_stores, ) from leapfrogai_api.utils import get_model_config +from prometheus_fastapi_instrumentator import Instrumentator logging.basicConfig( level=os.getenv("LFAI_LOG_LEVEL", logging.INFO), @@ -61,6 +63,21 @@ async def lifespan(app: FastAPI): app = FastAPI(lifespan=lifespan) +@app.get("/", include_in_schema=False) +async def root(): + """Intercepts the root path and redirects to the API documentation.""" + return RedirectResponse(url="/docs") + + +Instrumentator( + excluded_handlers=["/healthz", "/metrics"], + should_group_status_codes=False, +).instrument(app).expose( + app, + include_in_schema=False, +) + + @app.exception_handler(RequestValidationError) async def 
validation_exception_handler(request, exc): logger.error(f"The client sent invalid data!: {exc}") @@ -81,6 +98,8 @@ async def validation_exception_handler(request, exc): app.include_router(messages.router) app.include_router(runs_steps.router) app.include_router(lfai_vector_stores.router) +if os.environ.get("DEV"): + app.include_router(lfai_rag.router) app.include_router(lfai_token_count.router) app.include_router(lfai_models.router) # This should be at the bottom to prevent it preempting more specific runs endpoints diff --git a/src/leapfrogai_api/pyproject.toml b/src/leapfrogai_api/pyproject.toml index a18f6422f..4542f7922 100644 --- a/src/leapfrogai_api/pyproject.toml +++ b/src/leapfrogai_api/pyproject.toml @@ -26,6 +26,8 @@ dependencies = [ "postgrest==0.16.11", # required by supabase, bug when using previous versions "openpyxl == 3.1.5", "psutil == 6.0.0", + "prometheus-fastapi-instrumentator == 7.0.0", + "rerankers[flashrank] == 0.5.3" ] requires-python = "~=3.11" diff --git a/src/leapfrogai_api/routers/leapfrogai/rag.py b/src/leapfrogai_api/routers/leapfrogai/rag.py new file mode 100644 index 000000000..3b61b616e --- /dev/null +++ b/src/leapfrogai_api/routers/leapfrogai/rag.py @@ -0,0 +1,56 @@ +"""LeapfrogAI endpoints for RAG.""" + +from fastapi import APIRouter +from leapfrogai_api.typedef.rag.rag_types import ( + ConfigurationSingleton, + ConfigurationPayload, +) +from leapfrogai_api.routers.supabase_session import Session +from leapfrogai_api.utils.logging_tools import logger + +router = APIRouter(prefix="/leapfrogai/v1/rag", tags=["leapfrogai/rag"]) + + +@router.patch("/configure") +async def configure(session: Session, configuration: ConfigurationPayload) -> None: + """ + Configures the RAG settings at runtime. + + Args: + session (Session): The database session. + configuration (Configuration): The configuration to update. + """ + + # We set the class variable to update the configuration globally + ConfigurationSingleton._instance = ConfigurationSingleton.get_instance().copy( + update=configuration.dict(exclude_none=True) + ) + + +@router.get("/configure") +async def get_configuration(session: Session) -> ConfigurationPayload: + """ + Retrieves the current RAG configuration. + + Args: + session (Session): The database session. + + Returns: + Configuration: The current RAG configuration. 
+ """ + + instance = ConfigurationSingleton.get_instance() + + # Create a new dictionary with only the relevant attributes + config_dict = { + key: value + for key, value in instance.__dict__.items() + if not key.startswith("_") # Exclude private attributes + } + + # Create a new ConfigurationPayload instance with the filtered dictionary + new_configuration = ConfigurationPayload(**config_dict) + + logger.info(f"The current configuration has been set to {new_configuration}") + + return new_configuration diff --git a/src/leapfrogai_api/routers/leapfrogai/vector_stores.py b/src/leapfrogai_api/routers/leapfrogai/vector_stores.py index 09f8f4a77..5251440c1 100644 --- a/src/leapfrogai_api/routers/leapfrogai/vector_stores.py +++ b/src/leapfrogai_api/routers/leapfrogai/vector_stores.py @@ -33,9 +33,7 @@ async def search( """ query_service = QueryService(db=session) return await query_service.query_rag( - query=query, - vector_store_id=vector_store_id, - k=k, + query=query, vector_store_id=vector_store_id, k=k ) diff --git a/src/leapfrogai_api/typedef/completion/completion_types.py b/src/leapfrogai_api/typedef/completion/completion_types.py index 9a5cdad95..f92d91f28 100644 --- a/src/leapfrogai_api/typedef/completion/completion_types.py +++ b/src/leapfrogai_api/typedef/completion/completion_types.py @@ -7,15 +7,48 @@ class FinishReason(Enum): - NONE = 0 # Maps to "None" - STOP = 1 # Maps to "stop" - LENGTH = 2 # Maps to "length" + NONE = 0 + STOP = 1 + LENGTH = 2 - def to_string(self) -> str | None: + def to_finish_reason(self) -> str | None: + """ + Convert the enum member to its corresponding finish reason string. + + Returns: + str | None: The finish reason as a lowercase string if it is not NONE; otherwise, None. + """ if self == FinishReason.NONE: return None return self.name.lower() + @classmethod + def _missing_(cls, value): + """ + Handle missing values when creating an enum instance. + + This method is called when a value passed to the enum constructor does not match any existing enum members. + It provides custom logic to map input values to enum members or raises an error if the value is invalid. + + Args: + value: The value that was not found among the enum members. + + Returns: + FinishReason: The corresponding enum member after applying custom mapping. + + Raises: + ValueError: If the value cannot be mapped to any enum member. 
+ """ + # Handle custom value mappings + if value is None or value == "None": + return cls.NONE + elif value == "stop": + return cls.STOP + elif value == "length": + return cls.LENGTH + else: + raise ValueError(f"Invalid FinishReason value: {value}") + class CompletionChoice(BaseModel): """Choice object for completion.""" diff --git a/src/leapfrogai_api/typedef/rag/__init__.py b/src/leapfrogai_api/typedef/rag/__init__.py new file mode 100644 index 000000000..65c2e26cd --- /dev/null +++ b/src/leapfrogai_api/typedef/rag/__init__.py @@ -0,0 +1,3 @@ +from .rag_types import ( + ConfigurationSingleton as ConfigurationSingleton, +) diff --git a/src/leapfrogai_api/typedef/rag/rag_types.py b/src/leapfrogai_api/typedef/rag/rag_types.py new file mode 100644 index 000000000..17fe6601c --- /dev/null +++ b/src/leapfrogai_api/typedef/rag/rag_types.py @@ -0,0 +1,40 @@ +from typing import Optional + +from pydantic import BaseModel, Field + + +class ConfigurationSingleton: + """Singleton manager for ConfigurationPayload.""" + + _instance = None + + @classmethod + def get_instance(cls): + if cls._instance is None: + cls._instance = ConfigurationPayload() + cls._instance.enable_reranking = True + cls._instance.rag_top_k_when_reranking = 100 + cls._instance.ranking_model = "flashrank" + return cls._instance + + +class ConfigurationPayload(BaseModel): + """Response for RAG configuration.""" + + enable_reranking: Optional[bool] = Field( + default=None, + examples=[True, False], + description="Enables reranking for RAG queries", + ) + # More model info can be found here: + # https://github.com/AnswerDotAI/rerankers?tab=readme-ov-file + # https://pypi.org/project/rerankers/ + ranking_model: Optional[str] = Field( + default=None, + description="What model to use for reranking. Some options may require additional python dependencies.", + examples=["flashrank", "rankllm", "cross-encoder", "colbert"], + ) + rag_top_k_when_reranking: Optional[int] = Field( + default=None, + description="The top-k results returned from the RAG call before reranking", + ) diff --git a/src/leapfrogai_api/typedef/vectorstores/search_types.py b/src/leapfrogai_api/typedef/vectorstores/search_types.py index d8d2a2d13..ea69df1fe 100644 --- a/src/leapfrogai_api/typedef/vectorstores/search_types.py +++ b/src/leapfrogai_api/typedef/vectorstores/search_types.py @@ -1,3 +1,5 @@ +from typing import Optional + from pydantic import BaseModel, Field @@ -25,6 +27,14 @@ class SearchItem(BaseModel): similarity: float = Field( ..., description="Similarity score of this item to the query." 
) + rank: Optional[int] = Field( + default=None, + description="The rank of this search item after ranking has occurred.", + ) + score: Optional[float] = Field( + default=None, + description="The score of this search item after ranking has occurred.", + ) class SearchResponse(BaseModel): diff --git a/src/leapfrogai_api/utils/logging_tools.py b/src/leapfrogai_api/utils/logging_tools.py new file mode 100644 index 000000000..aa2448288 --- /dev/null +++ b/src/leapfrogai_api/utils/logging_tools.py @@ -0,0 +1,12 @@ +import os +import logging +from dotenv import load_dotenv + +load_dotenv() + +logging.basicConfig( + level=os.getenv("LFAI_LOG_LEVEL", logging.INFO), + format="%(name)s: %(asctime)s | %(levelname)s | %(filename)s:%(lineno)s >>> %(message)s", +) + +logger = logging.getLogger(__name__) diff --git a/src/leapfrogai_evals/pyproject.toml b/src/leapfrogai_evals/pyproject.toml index 1974da81a..9726c51c0 100644 --- a/src/leapfrogai_evals/pyproject.toml +++ b/src/leapfrogai_evals/pyproject.toml @@ -8,7 +8,7 @@ version = "0.13.1" dependencies = [ "deepeval == 1.3.0", - "openai == 1.42.0", + "openai == 1.45.0", "tqdm == 4.66.5", "python-dotenv == 1.0.1", "seaborn == 0.13.2", @@ -16,7 +16,8 @@ dependencies = [ "huggingface-hub == 0.24.6", "anthropic ==0.34.2", "instructor ==1.4.3", - "pyPDF2 == 3.0.1" + "pyPDF2 == 3.0.1", + "python-dotenv == 1.0.1" ] requires-python = "~=3.11" readme = "README.md" diff --git a/tasks.yaml b/tasks.yaml new file mode 100644 index 000000000..2298757ba --- /dev/null +++ b/tasks.yaml @@ -0,0 +1,133 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/defenseunicorns/uds-cli/v0.14.0/tasks.schema.json + +includes: + - badge: https://raw.githubusercontent.com/defenseunicorns/uds-common/82e63be82766a2e550a847af904b2d738c9d3478/tasks/badge.yaml + +tasks: + - name: nightly-uds-badge-verification + description: "Runs in a pipeline and produces a report for archiving" + actions: + - description: "Create Reports Directory" + cmd: | + mkdir -p reports + - description: "Run UDS Badge Verification Task" + cmd: | + uds run verify-uds-badge-cpu --no-progress 2>&1 | tee ./reports/intermediate-report.txt + - description: "Clean Up Final Report" + cmd: | + python3 .github/scripts/uds_verification_report.py | tee ./reports/final-report.txt + + ############# + # BADGE TASKS + ############# + - name: verify-uds-badge-cpu + description: "Runs through all CPU UDS bundle packages with the UDS badge verification test" + actions: + - task: verify-uds-badge-api + - task: verify-uds-badge-ui + - task: verify-uds-badge-llama-cpp-python + - task: verify-uds-badge-text-embeddings + - task: verify-uds-badge-whisper + - task: verify-uds-badge-supabase + + - name: verify-uds-badge-gpu + description: "Runs through all GPU UDS bundle packages with the UDS badge verification test" + actions: + - task: verify-uds-badge-api + - task: verify-uds-badge-ui + - task: verify-uds-badge-vllm + - task: verify-uds-badge-text-embeddings + - task: verify-uds-badge-whisper + - task: verify-uds-badge-supabase + + ####################### + # RE-USABLE BADGE TASKS + ####################### + + - name: verify-uds-badge-api + actions: + - description: "Verify API" + cmd: | + uds run badge:verify-badge \ + --set CHART_PATH="chart" \ + --set GROUP_NAME="package" \ + --set COMMON_ZARF="true" \ + --set PACKAGE_DIR="packages/api" \ + --no-progress + + - name: verify-uds-badge-ui + actions: + - description: "Verify UI" + cmd: | + uds run badge:verify-badge \ + --set CHART_PATH="chart" \ + --set 
GROUP_NAME="package" \ + --set COMMON_ZARF="false" \ + --set PACKAGE_DIR="packages/ui" \ + --no-progress + + - name: verify-uds-badge-llama-cpp-python + actions: + - description: "Verify LLaMA-CPP-Python" + cmd: | + uds run badge:verify-badge \ + --set CHART_PATH="chart" \ + --set GROUP_NAME="package" \ + --set COMMON_ZARF="false" \ + --set PACKAGE_DIR="packages/llama-cpp-python" \ + --no-progress + + - name: verify-uds-badge-vllm + actions: + - description: "Verify vLLM" + cmd: | + uds run badge:verify-badge \ + --set CHART_PATH="chart" \ + --set GROUP_NAME="package" \ + --set COMMON_ZARF="false" \ + --set PACKAGE_DIR="packages/vllm" \ + --no-progress + + - name: verify-uds-badge-text-embeddings + actions: + - description: "Verify Text-Embeddings" + cmd: | + uds run badge:verify-badge \ + --set CHART_PATH="chart" \ + --set GROUP_NAME="package" \ + --set COMMON_ZARF="false" \ + --set PACKAGE_DIR="packages/text-embeddings" \ + --no-progress + + - name: verify-uds-badge-whisper + actions: + - description: "Verify Whisper" + cmd: | + uds run badge:verify-badge \ + --set CHART_PATH="chart" \ + --set GROUP_NAME="package" \ + --set COMMON_ZARF="false" \ + --set PACKAGE_DIR="packages/whisper" \ + --no-progress + + - name: verify-uds-badge-repeater + actions: + - description: "Verify Repeater" + cmd: | + uds run badge:verify-badge \ + --set CHART_PATH="chart" \ + --set GROUP_NAME="package" \ + --set COMMON_ZARF="false" \ + --set PACKAGE_DIR="packages/repeater" \ + --no-progress + + - name: verify-uds-badge-supabase + actions: + - description: "Verify Supabase" + cmd: | + uds run badge:verify-badge \ + --set CHART_PATH="chart" \ + --set GROUP_NAME="package" \ + --set COMMON_ZARF="false" \ + --set PACKAGE_DIR="packages/supabase" \ + --no-progress diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index 4f498b102..580034011 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -1,12 +1,14 @@ +from openai import OpenAI import pytest -from openai import OpenAI +from tests.utils.client import leapfrogai_client, get_leapfrogai_model -from .utils import create_test_user + +@pytest.fixture(scope="module") +def client() -> OpenAI: + return leapfrogai_client() @pytest.fixture(scope="module") -def client(): - return OpenAI( - base_url="https://leapfrogai-api.uds.dev/openai/v1", api_key=create_test_user() - ) +def model_name() -> str: + return get_leapfrogai_model() diff --git a/tests/e2e/test_api.py b/tests/e2e/test_api.py index b556954e0..44e533645 100644 --- a/tests/e2e/test_api.py +++ b/tests/e2e/test_api.py @@ -5,7 +5,7 @@ import pytest as pytest import requests -from .utils import create_test_user +from tests.utils.client import create_test_user logger = logging.getLogger(__name__) test_id = str(uuid.uuid4()) diff --git a/tests/e2e/test_llm_generation.py b/tests/e2e/test_llm_generation.py index badb0dd3e..cb309d597 100644 --- a/tests/e2e/test_llm_generation.py +++ b/tests/e2e/test_llm_generation.py @@ -1,41 +1,28 @@ -import os from typing import Iterable -import warnings import pytest from openai import InternalServerError, OpenAI from openai.types.chat import ChatCompletionMessageParam from tests.utils.data_path import data_path, WAV_FILE -DEFAULT_LEAPFROGAI_MODEL = "llama-cpp-python" - - -def get_model_name(): - model_name = os.getenv("LEAPFROGAI_MODEL") - if model_name is None: - warnings.warn( - f"LEAPFROGAI_MODEL environment variable not set. Defaulting to '{DEFAULT_LEAPFROGAI_MODEL}'.\n" - "Consider setting LEAPFROGAI_MODEL explicitly. 
-            "Consider setting LEAPFROGAI_MODEL explicitly. Examples: 'vllm', 'repeater', 'llama-cpp-python'."
-        )
-        model_name = DEFAULT_LEAPFROGAI_MODEL
-    return model_name
-
-
-@pytest.fixture
-def model_name():
-    return get_model_name()
+# Test generation parameters
+SYSTEM_PROMPT = "You are a helpful assistant."
+USER_PROMPT = "Only return 1 word"
+MAX_TOKENS = 128
+TEMPERATURE = 0


 def test_chat_completions(client: OpenAI, model_name: str):
     messages: Iterable[ChatCompletionMessageParam] = [
-        {"role": "system", "content": "You are a helpful assistant."},
-        {"role": "user", "content": "What is your name?"},
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": USER_PROMPT},
     ]
     chat_completion = client.chat.completions.create(
         model=model_name,
         messages=messages,
-        max_tokens=128,
+        max_tokens=MAX_TOKENS,
+        temperature=TEMPERATURE,
     )
     assert chat_completion.model == model_name
     assert len(chat_completion.choices) == 1
@@ -51,8 +38,9 @@ def test_chat_completions(client: OpenAI, model_name: str):
 def test_completions(client: OpenAI, model_name: str):
     completion = client.completions.create(
         model=model_name,
-        prompt="Only return 1 word",
-        max_tokens=128,
+        prompt=USER_PROMPT,
+        max_tokens=MAX_TOKENS,
+        temperature=TEMPERATURE,
     )
     assert completion.model == model_name
     assert len(completion.choices) == 1
diff --git a/tests/e2e/test_supabase.py b/tests/e2e/test_supabase.py
index 1e98f2ec4..c9302c6be 100644
--- a/tests/e2e/test_supabase.py
+++ b/tests/e2e/test_supabase.py
@@ -17,7 +17,7 @@

 from leapfrogai_api.data.crud_vector_store_file import CRUDVectorStoreFile

-from .utils import ANON_KEY, create_test_user, SERVICE_KEY
+from tests.utils.client import ANON_KEY, create_test_user, SERVICE_KEY
 from openai.types import FileObject

 health_urls = {
diff --git a/tests/e2e/test_text_backend_full.py b/tests/e2e/test_text_backend_full.py
index fdee17172..d1f28bcf4 100644
--- a/tests/e2e/test_text_backend_full.py
+++ b/tests/e2e/test_text_backend_full.py
@@ -21,7 +21,7 @@ def download_arxiv_pdf():
     )


-def test_run_with_background_task(client: OpenAI):
+def test_run_with_background_task(client: OpenAI, model_name: str):
     """
     This test confirms whether a vector store for an assistant can index files
     while chatting at the same time.
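Below is a minimal, illustrative sketch (not part of this changeset) of how the shared `client`/`model_name` fixtures and the module-level constants introduced above could be reused for an additional end-to-end check. The streaming behaviour of the backend and the test name `test_chat_completions_streaming` are assumptions, not code from this PR:

# Hypothetical streaming variant of the chat completions e2e test.
# Assumes it lives in tests/e2e/test_llm_generation.py, so the SYSTEM_PROMPT,
# USER_PROMPT, MAX_TOKENS, and TEMPERATURE constants defined above are in scope.
from typing import Iterable

from openai import OpenAI
from openai.types.chat import ChatCompletionMessageParam


def test_chat_completions_streaming(client: OpenAI, model_name: str):
    messages: Iterable[ChatCompletionMessageParam] = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT},
    ]
    # stream=True returns an iterator of ChatCompletionChunk objects
    stream = client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
        stream=True,
    )
    # Join the streamed deltas; chunks with no content (e.g. the final one) are skipped
    content = "".join(chunk.choices[0].delta.content or "" for chunk in stream)
    assert len(content) > 0

Pinning `temperature=0`, as the diff does for the existing tests, keeps the generated text deterministic enough for these assertions to be stable across runs.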
@@ -52,7 +52,7 @@ def test_run_with_background_task(client: OpenAI):
     # Create an assistant
     assistant = client.beta.assistants.create(
-        model="llama-cpp-python",
+        model=model_name,
         name="Test Assistant",
         instructions="You are a helpful assistant with access to a knowledge base about AI and machine learning.",
         tools=[{"type": "file_search"}],
diff --git a/tests/e2e/utils.py b/tests/e2e/utils.py
deleted file mode 100644
index 32eb8daff..000000000
--- a/tests/e2e/utils.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import json
-import logging
-import os
-import traceback
-import pytest
-import requests
-
-# This is the anon_key for supabase, it provides access to the endpoints that would otherwise be inaccessible
-ANON_KEY = os.environ["ANON_KEY"]
-SERVICE_KEY = os.environ["SERVICE_KEY"]
-DEFAULT_TEST_EMAIL = "fakeuser1@test.com"
-DEFAULT_TEST_PASSWORD = "password"
-
-
-def create_test_user(
-    anon_key: str = ANON_KEY,
-    email: str = DEFAULT_TEST_EMAIL,
-    password: str = DEFAULT_TEST_PASSWORD,
-) -> str:
-    headers = {
-        "apikey": f"{anon_key}",
-        "Authorization": f"Bearer {anon_key}",
-        "Content-Type": "application/json",
-    }
-
-    try:
-        requests.post(
-            url="https://supabase-kong.uds.dev/auth/v1/signup",
-            headers=headers,
-            json={
-                "email": email,
-                "password": password,
-                "confirmPassword": password,
-            },
-        )
-    except Exception:
-        logging.error(
-            "Error creating user (likely because the user already exists): %s",
-            traceback.format_exc(),
-        )
-
-    return get_jwt_token(anon_key, email, password)
-
-
-def get_jwt_token(
-    api_key: str,
-    test_email: str = DEFAULT_TEST_EMAIL,
-    test_password: str = DEFAULT_TEST_PASSWORD,
-) -> str:
-    url = "https://supabase-kong.uds.dev/auth/v1/token?grant_type=password"
-    headers = {"apikey": f"{api_key}", "Content-Type": "application/json"}
-    data = {"email": test_email, "password": test_password}
-
-    response = requests.post(url, headers=headers, json=data)
-    if response.status_code != 200:
-        pytest.fail(
-            f"Request for the JWT token failed with status code {response.status_code} expected 200",
-            False,
-        )
-
-    return json.loads(response.content)["access_token"]
diff --git a/tests/integration/api/test_rag_files.py b/tests/integration/api/test_rag_files.py
index 45f832418..7520ddbcc 100644
--- a/tests/integration/api/test_rag_files.py
+++ b/tests/integration/api/test_rag_files.py
@@ -1,9 +1,13 @@
 import os
+from typing import Optional
+
+import requests
 from openai.types.beta.threads.text import Text
 import pytest

 from tests.utils.data_path import data_path
-from tests.utils.client import client_config_factory
+from leapfrogai_api.typedef.rag.rag_types import ConfigurationPayload
+from tests.utils.client import client_config_factory, get_leapfrogai_api_url_base


 def make_test_assistant(client, model, vector_store_id):
@@ -77,3 +81,66 @@

     for a in message_content.annotations:
         print(a.text)
+
+
+def configure_rag(
+    enable_reranking: bool,
+    ranking_model: str,
+    rag_top_k_when_reranking: int,
+):
+    """
+    Configures the RAG settings.
+
+    Args:
+        enable_reranking: Whether to enable reranking.
+        ranking_model: The ranking model to use.
+        rag_top_k_when_reranking: The top-k results to return before reranking.
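+
+    Example (illustrative only; mirrors the call made in the reranking test below)::
+
+        configure_rag(
+            enable_reranking=True,
+            ranking_model="flashrank",
+            rag_top_k_when_reranking=100,
+        )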
+ """ + url = f"{get_leapfrogai_api_url_base()}/leapfrogai/v1/rag/configure" + configuration = ConfigurationPayload( + enable_reranking=enable_reranking, + ranking_model=ranking_model, + rag_top_k_when_reranking=rag_top_k_when_reranking, + ) + + try: + response = requests.patch(url, json=configuration.model_dump()) + response.raise_for_status() # Raise an exception for bad status codes (4xx or 5xx) + print("RAG configuration updated successfully.") + except requests.exceptions.RequestException as e: + print(f"Error configuring RAG: {e}") + + +def get_rag_configuration() -> Optional[ConfigurationPayload]: + """ + Retrieves the current RAG configuration. + + Args: + base_url: The base URL of the API. + + Returns: + The RAG configuration, or None if there was an error. + """ + url = f"{get_leapfrogai_api_url_base()}/leapfrogai/v1/rag/configure" + + try: + response = requests.get(url) + response.raise_for_status() + config = ConfigurationPayload.model_validate_json(response.text) + print(f"Current RAG configuration: {config}") + return config + except requests.exceptions.RequestException as e: + print(f"Error getting RAG configuration: {e}") + return None + + +@pytest.mark.skipif( + os.environ.get("LFAI_RUN_NIAH_TESTS") != "true", + reason="LFAI_RUN_NIAH_TESTS envvar was not set to true", +) +def test_rag_needle_haystack_with_reranking(): + configure_rag(True, "flashrank", 100) + config_result = get_rag_configuration() + assert config_result is not None + assert config_result.enable_reranking is True + test_rag_needle_haystack() diff --git a/tests/integration/api/test_vector_stores.py b/tests/integration/api/test_vector_stores.py index 5427a0943..9a3be72a4 100644 --- a/tests/integration/api/test_vector_stores.py +++ b/tests/integration/api/test_vector_stores.py @@ -1,7 +1,6 @@ """Test the API endpoints for assistants.""" import json -import os import time import pytest @@ -19,6 +18,7 @@ ) from leapfrogai_api.routers.openai.vector_stores import router as vector_store_router from leapfrogai_api.routers.openai.files import router as files_router +from tests.utils.client import create_test_user from tests.utils.data_path import data_path, TXT_FILE INSTRUCTOR_XL_EMBEDDING_SIZE: int = 768 @@ -37,11 +37,11 @@ class MissingEnvironmentVariable(Exception): headers: dict[str, str] = {} try: - headers = {"Authorization": f"Bearer {os.environ['SUPABASE_USER_JWT']}"} + headers = {"Authorization": f"Bearer {create_test_user()}"} except KeyError as exc: raise MissingEnvironmentVariable( "SUPABASE_USER_JWT must be defined for the test to pass. " - "Please check the api README for instructions on obtaining this token." + "Please check the packages/api and src/leapfrogai_api READMEs for instructions on obtaining this token." 
     ) from exc

 vector_store_client = TestClient(vector_store_router, headers=headers)
diff --git a/tests/pytest/leapfrogai_api/test_api.py b/tests/pytest/leapfrogai_api/test_api.py
index 724b0dc58..ec6460fda 100644
--- a/tests/pytest/leapfrogai_api/test_api.py
+++ b/tests/pytest/leapfrogai_api/test_api.py
@@ -32,6 +32,7 @@
 )

 TEXT_INPUT_LEN = len(TEXT_INPUT)
+

 #########################
 #########################
@@ -147,6 +148,7 @@ def test_routes():
         "/openai/v1/files": ["POST"],
         "/openai/v1/assistants": ["POST"],
         "/leapfrogai/v1/count/tokens": ["POST"],
+        "/leapfrogai/v1/rag/configure": ["GET", "PATCH"],
     }

     openai_routes = [
@@ -196,10 +198,14 @@ def test_routes():
     ]

     actual_routes = app.routes
-    for route in actual_routes:
-        if hasattr(route, "path") and route.path in expected_routes:
-            assert route.methods == set(expected_routes[route.path])
-            del expected_routes[route.path]
+    for expected_route in expected_routes:
+        matching_routes = {expected_route: []}
+        for actual_route in actual_routes:
+            if hasattr(actual_route, "path") and expected_route == actual_route.path:
+                matching_routes[actual_route.path].extend(actual_route.methods)
+        assert set(expected_routes[expected_route]) <= set(
+            matching_routes[expected_route]
+        )

     for route, name, methods in openai_routes:
         found = False
@@ -214,8 +220,6 @@ def test_routes():
             break
         assert found, f"Missing route: {route}, {name}, {methods}"

-    assert len(expected_routes) == 0
-

 def test_healthz():
     """Test the healthz endpoint."""
@@ -535,3 +539,55 @@ def test_token_count(dummy_auth_middleware):
     assert "token_count" in response_data
     assert isinstance(response_data["token_count"], int)
     assert response_data["token_count"] == len(input_text)
+
+
+@pytest.mark.skipif(
+    os.environ.get("LFAI_RUN_REPEATER_TESTS") != "true"
+    or os.environ.get("DEV") != "true",
+    reason="LFAI_RUN_REPEATER_TESTS envvar was not set to true",
+)
+def test_configure(dummy_auth_middleware):
+    """Test the RAG configuration endpoints."""
+    with TestClient(app) as client:
+        rag_configuration_request = {
+            "enable_reranking": True,
+            "ranking_model": "rankllm",
+            "rag_top_k_when_reranking": 50,
+        }
+        response = client.patch(
+            "/leapfrogai/v1/rag/configure", json=rag_configuration_request
+        )
+        assert response.status_code == 200
+
+        response = client.get("/leapfrogai/v1/rag/configure")
+        assert response.status_code == 200
+        response_data = response.json()
+        assert "enable_reranking" in response_data
+        assert "ranking_model" in response_data
+        assert "rag_top_k_when_reranking" in response_data
+        assert isinstance(response_data["enable_reranking"], bool)
+        assert isinstance(response_data["ranking_model"], str)
+        assert isinstance(response_data["rag_top_k_when_reranking"], int)
+        assert response_data["enable_reranking"] is True
+        assert response_data["ranking_model"] == "rankllm"
+        assert response_data["rag_top_k_when_reranking"] == 50
+
+        # Update only some of the configs to see if the existing ones persist
+        rag_configuration_request = {"ranking_model": "flashrank"}
+        response = client.patch(
+            "/leapfrogai/v1/rag/configure", json=rag_configuration_request
+        )
+        assert response.status_code == 200
+
+        response = client.get("/leapfrogai/v1/rag/configure")
+        assert response.status_code == 200
+        response_data = response.json()
+        assert "enable_reranking" in response_data
+        assert "ranking_model" in response_data
+        assert "rag_top_k_when_reranking" in response_data
+        assert isinstance(response_data["enable_reranking"], bool)
+        assert isinstance(response_data["ranking_model"], str)
+        assert isinstance(response_data["rag_top_k_when_reranking"], int)
+        assert response_data["enable_reranking"] is True
+        assert response_data["ranking_model"] == "flashrank"
+        assert response_data["rag_top_k_when_reranking"] == 50
diff --git a/tests/utils/client.py b/tests/utils/client.py
index 6fe598514..0016f8c4c 100644
--- a/tests/utils/client.py
+++ b/tests/utils/client.py
@@ -1,8 +1,113 @@
+import json
+import logging
+import traceback
 from urllib.parse import urljoin
 from openai import OpenAI
 import os
+import pytest
 import requests
 from requests import Response
+from fastapi import status
+
+ANON_KEY = os.environ["ANON_KEY"]
+SERVICE_KEY = os.environ["SERVICE_KEY"]
+DEFAULT_TEST_EMAIL = "test-user@test.com"
+DEFAULT_TEST_PASSWORD = "password"
+
+
+def get_supabase_url() -> str:
+    """Get the URL for Supabase.
+
+    Returns:
+        str: The URL for Supabase. (default: "https://supabase-kong.uds.dev")
+    """
+
+    return os.getenv("SUPABASE_URL", "https://supabase-kong.uds.dev")
+
+
+def create_test_user(
+    anon_key: str = ANON_KEY,
+    email: str = DEFAULT_TEST_EMAIL,
+    password: str = DEFAULT_TEST_PASSWORD,
+) -> str:
+    """
+    Create a test user in the authentication system.
+
+    This function attempts to create a new user with the given email and password using the specified
+    anonymous API key. If the user already exists, the error is logged. It returns the JWT token
+    for the created or existing user.
+
+    Args:
+        anon_key (str): The anonymous API key for the authentication service.
+        email (str): The email address of the test user. Default is "test-user@test.com".
+        password (str): The password for the test user. Default is "password".
+
+    Returns:
+        str: The JWT token for the created or existing user.
+    """
+    supabase_base_url = get_supabase_url()
+
+    headers = {
+        "apikey": f"{anon_key}",
+        "Authorization": f"Bearer {anon_key}",
+        "Content-Type": "application/json",
+    }
+
+    try:
+        requests.post(
+            url=f"{supabase_base_url}/auth/v1/signup",
+            headers=headers,
+            json={
+                "email": email,
+                "password": password,
+                "confirmPassword": password,
+            },
+        )
+    except Exception:
+        logging.error(
+            "Error creating user (likely because the user already exists): %s",
+            traceback.format_exc(),
+        )
+
+    return get_jwt_token(supabase_base_url, anon_key, email, password)
+
+
+def get_jwt_token(
+    supabase_base_url: str,
+    api_key: str,
+    test_email: str = DEFAULT_TEST_EMAIL,
+    test_password: str = DEFAULT_TEST_PASSWORD,
+) -> str:
+    """
+    Retrieve a JWT token for a test user using email and password.
+
+    This function sends a request to the authentication service to obtain a JWT token using
+    the provided API key, email, and password.
+
+    Args:
+        supabase_base_url (str): The base URL of the Supabase instance.
+        api_key (str): The API key for the authentication service.
+        test_email (str): The email address of the test user. Default is "test-user@test.com".
+        test_password (str): The password for the test user. Default is "password".
+
+    Returns:
+        str: The JWT access token for the authenticated user.
+
+    Raises:
+        Failed: If the request fails or the response status code is not 200 (via pytest.fail).
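+
+    Example (illustrative only; not part of this changeset)::
+
+        token = get_jwt_token(get_supabase_url(), ANON_KEY)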
+ """ + + url = f"{supabase_base_url}/auth/v1/token?grant_type=password" + headers = {"apikey": f"{api_key}", "Content-Type": "application/json"} + data = {"email": test_email, "password": test_password} + + response = requests.post(url, headers=headers, json=data) + if response.status_code != status.HTTP_200_OK: + pytest.fail( + f"Request for the JWT token failed with status code {response.status_code} expected 200", + False, + ) + + return json.loads(response.content)["access_token"] def get_leapfrogai_model() -> str: @@ -12,7 +117,15 @@ def get_leapfrogai_model() -> str: str: The model to use for LeapfrogAI. (default: "vllm") """ - return os.getenv("LEAPFROGAI_MODEL", "vllm") + model = os.getenv("LEAPFROGAI_MODEL") + + if not model: + model = "vllm" + logging.warning( + f"LEAPFROGAI_MODEL is not set, using default model of `{model}`" + ) + + return model def get_openai_key() -> str: @@ -49,14 +162,18 @@ def get_leapfrogai_api_key() -> str: Returns: str: The API key for the LeapfrogAI API. + Raises: ValueError: If LEAPFROGAI_API_KEY or SUPABASE_USER_JWT is not set. """ api_key = os.getenv("LEAPFROGAI_API_KEY") or os.getenv("SUPABASE_USER_JWT") - if api_key is None: - raise ValueError("LEAPFROGAI_API_KEY or SUPABASE_USER_JWT not set") + if not api_key: + logging.warning( + "LEAPFROGAI_API_KEY or SUPABASE_USER_JWT not set, automatically generating test user." + ) + return create_test_user() return api_key @@ -74,9 +191,9 @@ def get_leapfrogai_api_url() -> str: def get_leapfrogai_api_url_base() -> str: """Get the base URL for the LeapfrogAI API. - Set via the LEAPFRAGAI_API_URL environment variable. + Set via the LEAPFROGAI_API_URL environment variable. - If LEAPFRAGAI_API_URL is set to "https://leapfrogai-api.uds.dev/openai/v1", this will trim off the "/openai/v1" part. + If LEAPFROGAI_API_URL is set to "https://leapfrogai-api.uds.dev/openai/v1", this will trim off the "/openai/v1" part. Returns: str: The base URL for the LeapfrogAI API. (default: "https://leapfrogai-api.uds.dev")