From 81ff6a3d7a714e0920bea0f75d47189c2f47fdb9 Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Wed, 2 Oct 2024 22:24:37 -0400 Subject: [PATCH 1/4] feat(api): add prometheus monitoring (#1166) Adds a python library for exposing basic fastapi metrics in prometheus syntax on /metrics endpoint Add a policy to block access to the /metrics from outside the cluster Adds a service monitor to api deployment so prometheus will scrape metrics Redirect base url to `/docs` to prevent not found errors. --- packages/api/chart/templates/istio-admin.yaml | 24 +++++++++++++++++++ packages/api/chart/templates/uds-package.yaml | 5 ++++ src/leapfrogai_api/main.py | 18 +++++++++++++- src/leapfrogai_api/pyproject.toml | 1 + 4 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 packages/api/chart/templates/istio-admin.yaml diff --git a/packages/api/chart/templates/istio-admin.yaml b/packages/api/chart/templates/istio-admin.yaml new file mode 100644 index 000000000..c369e8786 --- /dev/null +++ b/packages/api/chart/templates/istio-admin.yaml @@ -0,0 +1,24 @@ +{{- if .Capabilities.APIVersions.Has "security.istio.io/v1beta1" }} +apiVersion: security.istio.io/v1beta1 +kind: AuthorizationPolicy +metadata: + name: api-block-metrics-access-from-public-gateway + namespace: {{ .Release.Namespace }} +spec: + selector: + matchLabels: + {{- include "chart.selectorLabels" . | nindent 6 }} + action: DENY + rules: + - to: + - operation: + ports: + - "8080" + paths: + - /metrics* + from: + - source: + notNamespaces: + - istio-admin-gateway + - monitoring +{{- end }} diff --git a/packages/api/chart/templates/uds-package.yaml b/packages/api/chart/templates/uds-package.yaml index a6a83dea8..17220788d 100644 --- a/packages/api/chart/templates/uds-package.yaml +++ b/packages/api/chart/templates/uds-package.yaml @@ -7,6 +7,11 @@ metadata: labels: {{- include "chart.labels" . | nindent 4 }} spec: + monitor: + - portName: http + targetPort: {{ .Values.api.service.port }} + selector: + {{- include "chart.selectorLabels" . | nindent 8 }} network: expose: - service: {{ include "chart.fullname" . 
}} diff --git a/src/leapfrogai_api/main.py b/src/leapfrogai_api/main.py index f9b3682d4..108ccd51e 100644 --- a/src/leapfrogai_api/main.py +++ b/src/leapfrogai_api/main.py @@ -8,7 +8,7 @@ from fastapi import FastAPI from fastapi.exception_handlers import request_validation_exception_handler from fastapi.exceptions import RequestValidationError - +from fastapi.responses import RedirectResponse from leapfrogai_api.routers.base import router as base_router from leapfrogai_api.routers.leapfrogai import auth from leapfrogai_api.routers.leapfrogai import models as lfai_models @@ -30,6 +30,7 @@ vector_stores, ) from leapfrogai_api.utils import get_model_config +from prometheus_fastapi_instrumentator import Instrumentator logging.basicConfig( level=os.getenv("LFAI_LOG_LEVEL", logging.INFO), @@ -62,6 +63,21 @@ async def lifespan(app: FastAPI): app = FastAPI(lifespan=lifespan) +@app.get("/", include_in_schema=False) +async def root(): + """Intercepts the root path and redirects to the API documentation.""" + return RedirectResponse(url="/docs") + + +Instrumentator( + excluded_handlers=["/healthz", "/metrics"], + should_group_status_codes=False, +).instrument(app).expose( + app, + include_in_schema=False, +) + + @app.exception_handler(RequestValidationError) async def validation_exception_handler(request, exc): logger.error(f"The client sent invalid data!: {exc}") diff --git a/src/leapfrogai_api/pyproject.toml b/src/leapfrogai_api/pyproject.toml index ea9b8f7e4..4542f7922 100644 --- a/src/leapfrogai_api/pyproject.toml +++ b/src/leapfrogai_api/pyproject.toml @@ -26,6 +26,7 @@ dependencies = [ "postgrest==0.16.11", # required by supabase, bug when using previous versions "openpyxl == 3.1.5", "psutil == 6.0.0", + "prometheus-fastapi-instrumentator == 7.0.0", "rerankers[flashrank] == 0.5.3" ] requires-python = "~=3.11" From bb5e58c7ad4014d01a13986644e8fbbc8d665618 Mon Sep 17 00:00:00 2001 From: Gato <115658935+CollectiveUnicorn@users.noreply.github.com> Date: Thu, 3 Oct 2024 08:54:57 -0700 Subject: [PATCH 2/4] fix(whisper): failing gpu pod due to mangled env variable * Removes newlines that were causing the creation of an invalid environment variable entry. * Resolves issue with failing gpu pod being unable to find the libcudnn_ops_infer.so file. --- packages/whisper/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/whisper/Dockerfile b/packages/whisper/Dockerfile index b3bed054a..a5513e9fa 100644 --- a/packages/whisper/Dockerfile +++ b/packages/whisper/Dockerfile @@ -37,8 +37,8 @@ COPY --from=builder /leapfrogai/.venv/ /leapfrogai/.venv/ # set the path to the cuda 11.8 dependencies ENV LD_LIBRARY_PATH \ - /leapfrogai/.venv/lib64/python3.11/site-packages/nvidia/cublas/lib:\ - /leapfrogai/.venv/lib64/python3.11/site-packages/nvidia/cudnn/lib +/leapfrogai/.venv/lib64/python3.11/site-packages/nvidia/cublas/lib:\ +/leapfrogai/.venv/lib64/python3.11/site-packages/nvidia/cudnn/lib COPY packages/whisper/main.py . 
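The `LD_LIBRARY_PATH` change above removes the leading indentation from the continued path segments, so the variable resolves to a single colon-separated list with no stray whitespace in either entry. A minimal way to sanity-check the result from inside a built whisper image is sketched below; it assumes a Python interpreter is available in the image and is illustrative only, not part of the patch.

```python
# Illustrative sanity check, intended to be run inside the built whisper container.
# Any whitespace carried over from the previously indented continuation lines would
# show up inside the quotes of the repr() output.
import os

for entry in os.environ.get("LD_LIBRARY_PATH", "").split(":"):
    print(repr(entry))
```

Each printed entry should be a bare `/leapfrogai/.venv/...` path with nothing before the leading slash.
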
From 432b380da1355c5f0f6e9612d257a0dba0015a3c Mon Sep 17 00:00:00 2001 From: Justin Law <81255462+justinthelaw@users.noreply.github.com> Date: Thu, 3 Oct 2024 12:05:50 -0400 Subject: [PATCH 3/4] fix: nightly snapshots repository typo (#1177) --- .github/workflows/nightly-snapshot-release.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/nightly-snapshot-release.yaml b/.github/workflows/nightly-snapshot-release.yaml index 5025b6cae..da6abcdef 100644 --- a/.github/workflows/nightly-snapshot-release.yaml +++ b/.github/workflows/nightly-snapshot-release.yaml @@ -24,7 +24,7 @@ defaults: env: SNAPSHOT_VERSION: snapshot-latest - SNAPSHOT_SUB_REPOSITORY: /uds/snapshots + SNAPSHOT_SUB_REPOSITORY: /uds/snapshots/ permissions: contents: read @@ -170,6 +170,7 @@ jobs: env: ANON_KEY: ${{ steps.generate_secrets.outputs.ANON_KEY }} SERVICE_KEY: ${{ steps.generate_secrets.outputs.SERVICE_KEY }} + LEAPFROGAI_MODEL: llama-cpp-python run: | python -m pytest -vvv -s ./tests/e2e From fd3cbc4178fed7b29e474f05ab1712b1a9f70481 Mon Sep 17 00:00:00 2001 From: Justin Law <81255462+justinthelaw@users.noreply.github.com> Date: Thu, 3 Oct 2024 12:07:20 -0400 Subject: [PATCH 4/4] feat(vllm)!: upgrade vllm backend and refactor deployment (#854) ### BREAKING CHANGES: - moves all ENV specific to LeapfrogAI SDK to a ConfigMap using `volumeMount` for runtime injection and modification - in local dev, this is defined via `config.yaml` - moves all ENV specific to vLLM to a ConfigMap, using `envFrom` for runtime injection and modification - in local dev, this is defined via `.env` - `ZARF_CONFIG` is used to define create-time and deploy-time variables for (e.g., `MODEL_REPO_ID`, `ENFORCE_EAGER`) - updates Make targets and workflows with new `ZARF_CONFIG` variable - updates UDS bundles with new Zarf deployment variable overrides - allows delivery engineer's declarative definition of the backend configs and model - re-introduces LFAI SDK `config.yaml` configuration method for local development and testing - MUST upgrade API and backends together due to `FinishReason` proto change --- .github/actions/release/action.yaml | 2 +- .github/workflows/e2e-vllm.yaml | 2 +- Makefile | 4 +- bundles/dev/gpu/uds-config.yaml | 27 +++- bundles/latest/gpu/uds-config.yaml | 27 +++- docs/DEVELOPMENT.md | 8 +- packages/vllm/.env.example | 25 ++- packages/vllm/Dockerfile | 54 ++----- packages/vllm/Makefile | 25 ++- packages/vllm/README.md | 53 ++++++- packages/vllm/chart/templates/deployment.yaml | 12 +- .../templates/leapfrogai-sdk-configmap.yaml | 37 +++++ .../templates/vllm-engine-configmap.yaml | 14 ++ packages/vllm/chart/values.yaml | 29 +++- packages/vllm/config.yaml | 17 ++ packages/vllm/pyproject.toml | 2 +- packages/vllm/src/config.py | 94 +++++++++-- packages/vllm/src/main.py | 148 +++++++----------- packages/vllm/src/model_download.py | 13 +- packages/vllm/values/upstream-values.yaml | 43 +++++ packages/vllm/zarf-config.yaml | 39 +++++ packages/vllm/zarf.yaml | 107 ++++++++++--- src/leapfrogai_api/backend/grpc_client.py | 4 +- src/leapfrogai_api/backend/helpers.py | 4 +- .../typedef/completion/completion_types.py | 41 ++++- 25 files changed, 610 insertions(+), 221 deletions(-) create mode 100644 packages/vllm/chart/templates/leapfrogai-sdk-configmap.yaml create mode 100644 packages/vllm/chart/templates/vllm-engine-configmap.yaml create mode 100644 packages/vllm/config.yaml create mode 100644 packages/vllm/zarf-config.yaml diff --git a/.github/actions/release/action.yaml 
b/.github/actions/release/action.yaml index 63afd9b1d..38157f59e 100644 --- a/.github/actions/release/action.yaml +++ b/.github/actions/release/action.yaml @@ -138,7 +138,7 @@ runs: run: | docker buildx build --build-arg LOCAL_VERSION=${{ inputs.releaseTag }} -t ghcr.io/defenseunicorns/leapfrogai/vllm:${{ inputs.releaseTag }} --push -f packages/vllm/Dockerfile . - zarf package create packages/vllm --set=IMAGE_VERSION=${{ inputs.releaseTag }} --flavor upstream --confirm + ZARF_CONFIG=packages/vllm/zarf-config.yaml zarf package create packages/vllm --set=IMAGE_VERSION=${{ inputs.releaseTag }} --flavor upstream --confirm zarf package publish zarf-package-vllm-amd64-${{ inputs.releaseTag }}.tar.zst oci://ghcr.io/defenseunicorns/packages${{ inputs.subRepository }}leapfrogai diff --git a/.github/workflows/e2e-vllm.yaml b/.github/workflows/e2e-vllm.yaml index ace153006..6f89948ad 100644 --- a/.github/workflows/e2e-vllm.yaml +++ b/.github/workflows/e2e-vllm.yaml @@ -88,4 +88,4 @@ jobs: ########## - name: Build vLLM run: | - make build-vllm LOCAL_VERSION=e2e-test + make build-vllm LOCAL_VERSION=e2e-test ZARF_CONFIG=packages/vllm/zarf-config.yaml diff --git a/Makefile b/Makefile index ed74b5ccf..da9266246 100644 --- a/Makefile +++ b/Makefile @@ -123,7 +123,7 @@ build-vllm: local-registry docker-vllm ## Build the vllm container and Zarf pack docker push ${DOCKER_FLAGS} localhost:${REG_PORT}/defenseunicorns/leapfrogai/vllm:${LOCAL_VERSION} ## Build the Zarf package - uds zarf package create packages/vllm --flavor ${FLAVOR} -a ${ARCH} -o packages/vllm --registry-override=ghcr.io=localhost:${REG_PORT} --insecure --set IMAGE_VERSION=${LOCAL_VERSION} ${ZARF_FLAGS} --confirm + ZARF_CONFIG=packages/vllm/zarf-config.yaml uds zarf package create packages/vllm --flavor ${FLAVOR} -a ${ARCH} -o packages/vllm --registry-override=ghcr.io=localhost:${REG_PORT} --insecure --set IMAGE_VERSION=${LOCAL_VERSION} ${ZARF_FLAGS} --confirm docker-text-embeddings: sdk-wheel ## Build the image (and tag it for the local registry) @@ -263,7 +263,7 @@ silent-deploy-llama-cpp-python-package: silent-deploy-vllm-package: @echo "Starting VLLM deployment..." 
@mkdir -p .logs - @uds zarf package deploy packages/vllm/zarf-package-vllm-${ARCH}-${LOCAL_VERSION}.tar.zst ${ZARF_FLAGS} --confirm > .logs/deploy-vllm.log 2>&1 + @ZARF_CONFIG=packages/vllm/zarf-config.yaml uds zarf package deploy packages/vllm/zarf-package-vllm-${ARCH}-${LOCAL_VERSION}.tar.zst ${ZARF_FLAGS} --confirm > .logs/deploy-vllm.log 2>&1 @echo "VLLM deployment completed" silent-deploy-text-embeddings-package: diff --git a/bundles/dev/gpu/uds-config.yaml b/bundles/dev/gpu/uds-config.yaml index 9ad6cfdb4..1ef7a2634 100644 --- a/bundles/dev/gpu/uds-config.yaml +++ b/bundles/dev/gpu/uds-config.yaml @@ -9,8 +9,31 @@ variables: gpu_limit: 0 # runs on CPU until GPU limit is increased vllm: - gpu_limit: 1 # if <1, vllm won't work, VLLM is GPU only - #tensor_parallel_size: 1 # TODO: reintroduce when vllm changes get pulled in + trust_remote_code: "True" + tensor_parallel_size: "1" + enforce_eager: "False" + gpu_memory_utilization: "0.90" + worker_use_ray: "True" + engine_use_ray: "True" + quantization: "None" + load_format: "auto" + # LeapfrogAI SDK runtime configuration (usually influenced by config.yaml in development) + max_context_length: "32768" + stop_tokens: ", <|im_end|>, <|endoftext|>" + prompt_format_chat_system: "SYSTEM: {}\n" + prompt_format_chat_user: "USER: {}\n" + prompt_format_chat_assistant: "ASSISTANT: {}\n" + temperature: "0.1" + top_p: "1.0" + top_k: "0" + repetition_penalty: "1.0" + max_new_tokens: "8192" + # Pod deployment configuration + gpu_limit: "1" + gpu_runtime: "nvidia" + pvc_size: "15Gi" + pvc_access_mode: "ReadWriteOnce" + pvc_storage_class: "local-path" supabase: domain: "uds.dev" diff --git a/bundles/latest/gpu/uds-config.yaml b/bundles/latest/gpu/uds-config.yaml index 9ad6cfdb4..1ef7a2634 100644 --- a/bundles/latest/gpu/uds-config.yaml +++ b/bundles/latest/gpu/uds-config.yaml @@ -9,8 +9,31 @@ variables: gpu_limit: 0 # runs on CPU until GPU limit is increased vllm: - gpu_limit: 1 # if <1, vllm won't work, VLLM is GPU only - #tensor_parallel_size: 1 # TODO: reintroduce when vllm changes get pulled in + trust_remote_code: "True" + tensor_parallel_size: "1" + enforce_eager: "False" + gpu_memory_utilization: "0.90" + worker_use_ray: "True" + engine_use_ray: "True" + quantization: "None" + load_format: "auto" + # LeapfrogAI SDK runtime configuration (usually influenced by config.yaml in development) + max_context_length: "32768" + stop_tokens: ", <|im_end|>, <|endoftext|>" + prompt_format_chat_system: "SYSTEM: {}\n" + prompt_format_chat_user: "USER: {}\n" + prompt_format_chat_assistant: "ASSISTANT: {}\n" + temperature: "0.1" + top_p: "1.0" + top_k: "0" + repetition_penalty: "1.0" + max_new_tokens: "8192" + # Pod deployment configuration + gpu_limit: "1" + gpu_runtime: "nvidia" + pvc_size: "15Gi" + pvc_access_mode: "ReadWriteOnce" + pvc_storage_class: "local-path" supabase: domain: "uds.dev" diff --git a/docs/DEVELOPMENT.md b/docs/DEVELOPMENT.md index 9fefb8a7e..98343ef7f 100644 --- a/docs/DEVELOPMENT.md +++ b/docs/DEVELOPMENT.md @@ -13,20 +13,20 @@ Please first see the pre-requisites listed on the LeapfrogAI documentation websi It is **_HIGHLY RECOMMENDED_** that PyEnv be installed on your machine, and a new virtual environment is created for every new development branch. 
-Follow the installation instructions outlined in the [pyenv](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation) repository to install Python 3.11.6: +Follow the installation instructions outlined in the [pyenv](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation) repository to install Python 3.11.9: ```bash # install the correct python version - pyenv install 3.11.6 + pyenv install 3.11.9 # create a new virtual environment named "leapfrogai" - pyenv virtualenv 3.11.6 leapfrogai + pyenv virtualenv 3.11.9 leapfrogai # activate the virtual environment pyenv activate leapfrogai ``` -If your installation process completes successfully but indicates missing packages such as `sqlite3`, execute the following command to install the required packages then proceed with the reinstallation of Python 3.11.6: +If your installation process completes successfully but indicates missing packages such as `sqlite3`, execute the following command to install the required packages then proceed with the reinstallation of Python 3.11.9: ```bash sudo apt-get install build-essential zlib1g-dev libffi-dev \ diff --git a/packages/vllm/.env.example b/packages/vllm/.env.example index 1e3a00170..0a995e234 100644 --- a/packages/vllm/.env.example +++ b/packages/vllm/.env.example @@ -1,13 +1,12 @@ -export LAI_HF_HUB_ENABLE_HF_TRANSFER="1" -export LAI_REPO_ID="TheBloke/Synthia-7B-v2.0-GPTQ" -export LAI_REVISION="gptq-4bit-32g-actorder_True" -export LAI_QUANTIZATION="gptq" -export LAI_TENSOR_PARALLEL_SIZE=1 -export LAI_MODEL_SOURCE=".model/" -export LAI_MAX_CONTEXT_LENGTH=32768 -export LAI_STOP_TOKENS='["","<|endoftext|>","<|im_end|>"]' -export LAI_PROMPT_FORMAT_CHAT_SYSTEM="SYSTEM: {}\n" -export LAI_PROMPT_FORMAT_CHAT_ASSISTANT="ASSISTANT: {}\n" -export LAI_PROMPT_FORMAT_CHAT_USER="USER: {}\n" -export LAI_PROMPT_FORMAT_DEFAULTS_TOP_P=1.0 -export LAI_PROMPT_FORMAT_DEFAULTS_TOP_K=0 \ No newline at end of file +LFAI_REPO_ID="TheBloke/SynthIA-7B-v2.0-GPTQ" +LFAI_REVISION="gptq-4bit-32g-actorder_True" + +VLLM_TENSOR_PARALLEL_SIZE=1 +VLLM_TRUST_REMOTE_CODE=True +VLLM_MAX_CONTEXT_LENGTH=32768 +VLLM_ENFORCE_EAGER=False +VLLM_GPU_MEMORY_UTILIZATION=0.90 +VLLM_WORKER_USE_RAY=True +VLLM_ENGINE_USE_RAY=True +VLLM_QUANTIZATION=None +VLLM_LOAD_FORMAT=auto diff --git a/packages/vllm/Dockerfile b/packages/vllm/Dockerfile index 8676f5eda..f53088ead 100755 --- a/packages/vllm/Dockerfile +++ b/packages/vllm/Dockerfile @@ -6,8 +6,9 @@ FROM nvidia/cuda:12.2.2-devel-ubuntu22.04 AS builder # set SDK location # set the pyenv and Python versions ARG SDK_DEST=src/leapfrogai_sdk/build \ - PYTHON_VERSION=3.11.6 \ - PYENV_GIT_TAG=v2.4.8 + PYTHON_VERSION=3.11.9 \ + PYENV_GIT_TAG=v2.4.8\ + COMPONENT_DIRECTORY="packages/vllm" # use root user for deps installation and nonroot user creation USER root @@ -41,7 +42,7 @@ USER nonroot # copy-in SDK from sdk stage and vllm source code from host WORKDIR /home/leapfrogai COPY --from=sdk --chown=nonroot:nonroot /leapfrogai/${SDK_DEST} ./${SDK_DEST} -COPY --chown=nonroot:nonroot packages/vllm packages/vllm +COPY --chown=nonroot:nonroot ${COMPONENT_DIRECTORY} packages/vllm # create virtual environment for light-weight portability and minimal libraries RUN curl https://pyenv.run | bash && \ @@ -54,10 +55,10 @@ RUN curl https://pyenv.run | bash && \ ENV PYENV_ROOT="/home/nonroot/.pyenv" \ PATH="/home/nonroot/.pyenv/bin:$PATH" -# Install Python 3.11.6, set it as global, and create a venv +# Install Python, set it as global, and create a venv RUN . 
~/.bashrc && \ - PYTHON_CONFIGURE_OPTS="--enable-shared" pyenv install 3.11.6 && \ - pyenv global 3.11.6 && \ + PYTHON_CONFIGURE_OPTS="--enable-shared" pyenv install 3.11.9 && \ + pyenv global ${PYTHON_VERSION} && \ pyenv exec python -m venv .venv # set path to venv python @@ -67,26 +68,15 @@ RUN rm -f packages/vllm/build/*.whl && \ python -m pip wheel packages/vllm -w packages/vllm/build --find-links=${SDK_DEST} && \ pip install packages/vllm/build/lfai_vllm*.whl --no-index --find-links=packages/vllm/build/ +################# +# FINAL CONTAINER +################# + FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04 # set SDK location ARG SDK_DEST=src/leapfrogai_sdk/build -# model-specific arguments -ARG ARG HF_HUB_ENABLE_HF_TRANSFER="1" \ - REPO_ID="TheBloke/Synthia-7B-v2.0-GPTQ" \ - REVISION="gptq-4bit-32g-actorder_True" \ - MODEL_SOURCE="/data/.model/" \ - MAX_CONTEXT_LENGTH=32768 \ - STOP_TOKENS='[""]' \ - PROMPT_FORMAT_CHAT_SYSTEM="SYSTEM: {}\n" \ - PROMPT_FORMAT_CHAT_USER="USER: {}\n" \ - PROMPT_FORMAT_CHAT_ASSISTANT="ASSISTANT: {}\n" \ - PROMPT_FORMAT_DEFAULTS_TOP_P=1.0 \ - PROMPT_FORMAT_DEFAULTS_TOP_K=0 \ - TENSOR_PARALLEL_SIZE=1 \ - QUANTIZATION="gptq" - # setup nonroot user and permissions USER root RUN groupadd -g 65532 vglusers && \ @@ -101,24 +91,10 @@ COPY --from=sdk --chown=nonroot:nonroot /leapfrogai/${SDK_DEST} ./${SDK_DEST} COPY --from=builder --chown=nonroot:nonroot /home/leapfrogai/.venv /home/leapfrogai/.venv COPY --from=builder --chown=nonroot:nonroot /home/leapfrogai/packages/vllm/src /home/leapfrogai/packages/vllm/src # copy-in python binaries -COPY --from=builder --chown=nonroot:nonroot /home/nonroot/.pyenv/versions/3.11.6/ /home/nonroot/.pyenv/versions/3.11.6/ - -# load ARG values into env variables for pickup by confz -ENV LAI_HF_HUB_ENABLE_HF_TRANSFER=${HF_HUB_ENABLE_HF_TRANSFER} \ - LAI_REPO_ID=${REPO_ID} \ - LAI_REVISION=${REVISION} \ - LAI_MODEL_SOURCE=${MODEL_SOURCE} \ - LAI_MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH} \ - LAI_STOP_TOKENS=${STOP_TOKENS} \ - LAI_PROMPT_FORMAT_CHAT_SYSTEM=${PROMPT_FORMAT_CHAT_SYSTEM} \ - LAI_PROMPT_FORMAT_CHAT_USER=${PROMPT_FORMAT_CHAT_USER} \ - LAI_PROMPT_FORMAT_CHAT_ASSISTANT=${PROMPT_FORMAT_CHAT_ASSISTANT} \ - LAI_PROMPT_FORMAT_DEFAULTS_TOP_P=${PROMPT_FORMAT_DEFAULTS_TOP_P} \ - LAI_PROMPT_FORMAT_DEFAULTS_TOP_K=${PROMPT_FORMAT_DEFAULTS_TOP_K} \ - LAI_TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE} \ - LAI_QUANTIZATION=${QUANTIZATION} \ - # remove vLLM callback to stats server - VLLM_NO_USAGE_STATS=1 +COPY --from=builder --chown=nonroot:nonroot /home/nonroot/.pyenv/versions/${PYTHON_VERSION}/ /home/nonroot/.pyenv/versions/${PYTHON_VERSION}/ + +# remove vLLM callback to stats server +ENV VLLM_NO_USAGE_STATS=1 ENV PATH="/home/leapfrogai/.venv/bin:$PATH" diff --git a/packages/vllm/Makefile b/packages/vllm/Makefile index 98e8b29db..c764a78f2 100644 --- a/packages/vllm/Makefile +++ b/packages/vllm/Makefile @@ -1,6 +1,27 @@ +ARCH ?= amd64 +LOCAL_VERSION ?= $(shell git rev-parse --short HEAD) +DOCKER_FLAGS := + install: python -m pip install ../../src/leapfrogai_sdk python -m pip install -e ".[dev]" -dev: - python -m leapfrogai_sdk.cli --app-dir=src/ main:Model +download: + @env $$(cat .env | xargs) python src/model_download.py + +dev: download + @env $$(cat .env | xargs) python -m leapfrogai_sdk.cli --app-dir=src/ main:Model + +docker: download + docker build ${DOCKER_FLAGS} \ + --platform=linux/${ARCH} \ + --build-arg LOCAL_VERSION=${LOCAL_VERSION} \ + --build-arg COMPONENT_DIRECTORY="./" \ + -t 
ghcr.io/defenseunicorns/leapfrogai/vllm:${LOCAL_VERSION} \ + -f ./Dockerfile . + + docker run -it --rm \ + --env-file ./.env \ + -v $(PWD)/config.yaml:/home/leapfrogai/config.yaml \ + -v $(PWD)/.model:/home/leapfrogai/.model \ + ghcr.io/defenseunicorns/leapfrogai/vllm:${LOCAL_VERSION} diff --git a/packages/vllm/README.md b/packages/vllm/README.md index a55238cfd..5bc7a052f 100644 --- a/packages/vllm/README.md +++ b/packages/vllm/README.md @@ -16,13 +16,21 @@ See the LeapfrogAI documentation website for [system requirements](https://docs. The default model that comes with this backend in this repository's officially released images is a [4-bit quantization of the Synthia-7b model](https://huggingface.co/TheBloke/SynthIA-7B-v2.0-GPTQ). -You can optionally specify different models or quantization types using the following Docker build arguments: +All of the commands in this sub-section are executed within this `packages/vllm` sub-directory. -- `--build-arg HF_HUB_ENABLE_HF_TRANSFER="1"`: Enable or disable HuggingFace Hub transfer (default: 1) -- `--build-arg REPO_ID="TheBloke/Synthia-7B-v2.0-GPTQ"`: HuggingFace repository ID for the model -- `--build-arg REVISION="gptq-4bit-32g-actorder_True"`: Revision or commit hash for the model -- `--build-arg QUANTIZATION="gptq"`: Quantization type (e.g., gptq, awq, or empty for un-quantized) -- `--build-arg TENSOR_PARALLEL_SIZE="1"`: The number of gpus to spread the tensor processing across +Optionally, you can specify a different model during Zarf creation: + +```bash +uds zarf package create --confirm --set MODEL_REPO_ID=defenseunicorns/Hermes-2-Pro-Mistral-7B-4bit-32g --set MODEL_REVISION=main +``` + +If you decide to use a different model, there will likely be a need to change generation and engine runtime configurations, please see the [Zarf Package Config](./zarf-config.yaml) and the [values override file](./values/upstream-values.yaml) for details on what runtime parameters can be modified. These parameters are model-specific, and can be found in the HuggingFace model cards and/or configuration files (e.g., prompt templates). + +For example, during Zarf deployment, you can override the Zarf Package Config defaults by doing the following: + +```bash +uds zarf package deploy zarf-package-vllm-amd64-dev.tar.zst --confirm --set ENFORCE_EAGER=True +``` ### Deployment @@ -39,11 +47,26 @@ uds zarf package deploy packages/vllm/zarf-package-vllm-*-dev.tar.zst --confirm ### Local Development -To run the vllm backend locally: +In local development the [config.yaml](./config.yaml) and [.env.example](./.env.example) must be modified if the model has changed away from the default. The LeapfrogAI SDK picks up the `config.yaml` automatically, and the `.env` must be sourced into the Python environment. > [!IMPORTANT] > Execute the following commands from this sub-directory +Create a `.env` file based on the [`.env.example`](./.env.example): + +```bash +cp .env.example .env +source .env +``` + +As necessary, modify the existing [`config.yaml`](./config.yaml): + +```bash +vim config.yaml +``` + +To run the vllm backend locally: + ```bash # Install dev and runtime dependencies make install @@ -54,3 +77,19 @@ python src/model_download.py # Start the model backend make dev ``` + +#### Local Docker Container + +To run the Docker container, use the following Makefile commands. `LOCAL_VERSION` must be consistent across the two Make commands. 
+ +In the root of the LeapfrogAI repository: + +```bash +LOCAL_VERSION=dev make sdk-wheel +``` + +In the root of this vLLM sub-directory: + +```bash +LOCAL_VERSION=dev make docker +``` diff --git a/packages/vllm/chart/templates/deployment.yaml b/packages/vllm/chart/templates/deployment.yaml index 7b88cc137..3f8aa0540 100644 --- a/packages/vllm/chart/templates/deployment.yaml +++ b/packages/vllm/chart/templates/deployment.yaml @@ -36,7 +36,7 @@ spec: [ "sh", "-c", - 'while [ ! -f /data/.model/###ZARF_DATA_INJECTION_MARKER### ]; do echo "waiting for zarf data sync" && sleep 1; done; echo "we are done waiting!"', + 'while [ ! -f ###ZARF_CONST_MODEL_PATH###/###ZARF_DATA_INJECTION_MARKER### ]; do echo "waiting for zarf data sync" && sleep 1; done; echo "we are done waiting!"', ] resources: {{- toYaml .Values.modelInjectionContainer.resources | nindent 12 }} @@ -46,6 +46,9 @@ spec: - name: leapfrogai-pv-storage persistentVolumeClaim: claimName: lfai-{{ .Values.nameOverride }}-pv-claim + - name: leapfrogai-sdk-configmap + configMap: + name: "{{ .Values.nameOverride }}-sdk-configmap" securityContext: {{- toYaml .Values.podSecurityContext | nindent 8 }} containers: @@ -58,6 +61,9 @@ spec: env: {{- toYaml . | nindent 12 }} {{- end }} + envFrom: + - configMapRef: + name: "{{ .Values.nameOverride }}-engine-configmap" ports: - name: http containerPort: {{ .Values.service.port }} @@ -67,6 +73,10 @@ spec: volumeMounts: - name: leapfrogai-pv-storage mountPath: "/data" + - name: leapfrogai-sdk-configmap + mountPath: "/home/leapfrogai/config.yaml" + subPath: "config.yaml" + readOnly: true {{- with .Values.nodeSelector }} nodeSelector: {{- toYaml . | nindent 8 }} diff --git a/packages/vllm/chart/templates/leapfrogai-sdk-configmap.yaml b/packages/vllm/chart/templates/leapfrogai-sdk-configmap.yaml new file mode 100644 index 000000000..cdc08be5e --- /dev/null +++ b/packages/vllm/chart/templates/leapfrogai-sdk-configmap.yaml @@ -0,0 +1,37 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Values.nameOverride }}-sdk-configmap" + namespace: {{ .Release.Namespace | default "leapfrogai" }} +data: + config.yaml: | + model: + source: {{ .Values.leapfrogaiConfig.model.source | quote }} + max_context_length: {{ .Values.leapfrogaiConfig.maxContextLength | quote }} + stop_tokens: + {{- $stopTokens := .Values.leapfrogaiConfig.stopTokens }} + {{- range $stopToken := splitList ", " .Values.leapfrogaiConfig.stopTokens }} + - {{ printf "%s" $stopToken }} + {{- end }} + prompt_format: + {{- with .Values.leapfrogaiConfig.promptFormat.chat }} + chat: + {{- if .system }} + system: {{ .system | quote }} + {{- end }} + {{- if .assistant }} + assistant: {{ .assistant | quote }} + {{- end }} + {{- if .user }} + user: {{ .user | quote }} + {{- end }} + {{- if .function }} + function: {{ .function | quote }} + {{- end }} + {{- end }} + defaults: + temperature: {{ .Values.leapfrogaiConfig.defaults.temperature | quote }} + top_p: {{ .Values.leapfrogaiConfig.defaults.topP | quote }} + top_k: {{ .Values.leapfrogaiConfig.defaults.topK | quote }} + repetition_penalty: {{ .Values.leapfrogaiConfig.defaults.repetitionPenalty | quote }} + max_new_tokens: {{ .Values.leapfrogaiConfig.defaults.maxNewTokens | quote }} diff --git a/packages/vllm/chart/templates/vllm-engine-configmap.yaml b/packages/vllm/chart/templates/vllm-engine-configmap.yaml new file mode 100644 index 000000000..5ac82b42c --- /dev/null +++ b/packages/vllm/chart/templates/vllm-engine-configmap.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ 
.Values.nameOverride }}-engine-configmap" + namespace: {{ .Release.Namespace | default "leapfrogai" }} +data: + VLLM_TRUST_REMOTE_CODE: "{{ .Values.vllmConfig.trustRemoteCode }}" + VLLM_TENSOR_PARALLEL_SIZE: "{{ .Values.vllmConfig.tensorParallelSize }}" + VLLM_ENFORCE_EAGER: "{{ .Values.vllmConfig.enforceEager }}" + VLLM_GPU_MEMORY_UTILIZATION: "{{ .Values.vllmConfig.gpuMemoryUtilization }}" + VLLM_WORKER_USE_RAY: "{{ .Values.vllmConfig.workerUseRay }}" + VLLM_ENGINE_USE_RAY: "{{ .Values.vllmConfig.engineUseRay }}" + VLLM_QUANTIZATION: "{{ .Values.vllmConfig.quantization }}" + VLLM_LOAD_FORMAT: "{{ .Values.vllmConfig.loadFormat }}" diff --git a/packages/vllm/chart/values.yaml b/packages/vllm/chart/values.yaml index 0f7fe9911..0209a8b34 100644 --- a/packages/vllm/chart/values.yaml +++ b/packages/vllm/chart/values.yaml @@ -13,6 +13,33 @@ image: nameOverride: "vllm" fullnameOverride: "" +leapfrogaiConfig: + model: + source: "/data/.model/" + maxContextLength: "32768" + stopTokens: ", <|im_end|>, <|endoftext|>" + promptFormat: + chat: + system: "SYSTEM: {}\n" + assistant: "ASSISTANT: {}\n" + user: "USER: {}\n" + defaults: + temperature: "0.1" + topP: "1.0" + topK: "0" + repetitionPenalty: "1.0" + maxNewTokens: "8192" + +vllmConfig: + trustRemoteCode: "True" + tensorParallelSize: "1" + enforceEager: "False" + gpuMemoryUtilization: "0.90" + workerUseRay: "True" + engineUseRay: "True" + quantization: "None" + loadFormat: "auto" + env: - name: LFAI_LOG_LEVEL value: "INFO" @@ -41,7 +68,7 @@ resources: limits: cpu: 0 memory: 0 - nvidia.com/gpu: 0 + nvidia.com/gpu: 1 requests: cpu: 0 memory: 0 diff --git a/packages/vllm/config.yaml b/packages/vllm/config.yaml new file mode 100644 index 000000000..22210a74b --- /dev/null +++ b/packages/vllm/config.yaml @@ -0,0 +1,17 @@ +model: + source: ".model/" +max_context_length: 32768 +stop_tokens: + - "<|im_end|>" + - "<|endoftext|>" + - "" +prompt_format: + chat: + system: "SYSTEM: {}\n" + assistant: "ASSISTANT: {}\n" + user: "USER: {}\n" +defaults: + top_p: 1.0 + top_k: 0 + repetition_penalty: 1.0 + max_new_tokens: 8192 diff --git a/packages/vllm/pyproject.toml b/packages/vllm/pyproject.toml index 4d7955708..24b1363e6 100644 --- a/packages/vllm/pyproject.toml +++ b/packages/vllm/pyproject.toml @@ -8,7 +8,7 @@ version = "0.13.1" dependencies = [ "pydantic == 2.8.2", - "vllm == 0.4.2", + "vllm == 0.4.3", "python-dotenv == 1.0.1", "aiostream ==0.6.2", "leapfrogai-sdk", diff --git a/packages/vllm/src/config.py b/packages/vllm/src/config.py index debca4ba3..c13af5521 100644 --- a/packages/vllm/src/config.py +++ b/packages/vllm/src/config.py @@ -5,10 +5,6 @@ class ConfigOptions(BaseConfig): - quantization: Literal[None, "awq", "gptq", "squeezellm"] = Field( - default=None, - description="Type of quantization, for un-quantized models omit this field", - ) tensor_parallel_size: int = Field( default=1, title="GPU Utilization Count", @@ -16,39 +12,105 @@ class ConfigOptions(BaseConfig): "This must be divisible to the number of attention heads in the model", examples=[1, 2, 3], ) + quantization: Literal[ + "aqlm", + "bitsandbytes", + "awq", + "deepspeedfp", + "fp8", + "marlin", + "gptq_marlin_24", + "gptq_marlin", + "gptq", + "squeezellm", + "sparseml", + "None", + "", + ] = Field( + title="quantization", + description="Quantization type of the model" + "Force GPTQ instead of GPTQ_Marlin by explicitly providing `gptq` as value.", + examples=["awq", "fp8", "gptq_marlin", "gptq", "squeezellm", "None"], + ) + load_format: Literal["auto", "safetensors", "npz", "pt", 
"bitsandbytes"] = Field( + title="quantization", + description="Load format for the type model and files", + examples=["auto", "safetensors", "npz", "pt", "bitsandbytes"], + ) + enforce_eager: bool = Field( + title="Enable Eager Mode", + description="Enable eager mode to start token generation immediately after prompt processing." + "Potentially reduces initial latency at the cost of slightly higher memory usage." + "Should be set to False in production environments with higher GPU memory.", + examples=[True, False], + ) + gpu_memory_utilization: float = Field( + title="GPU Memory Limit", + description="Maximum amount of GPU vRAM allocated to the vLLM engine and worker(s)", + examples=[0.50, 0.80, 0.90], + ) + engine_use_ray: bool = Field( + title="Use Ray for Engine", + description="If True, uses Ray for managing the execution engine. Allows for distributed inferencing in multi-node situations.", + examples=[True, False], + ) + worker_use_ray: bool = Field( + title="Use Ray for Worker", + description="If True, uses Ray for distributed worker management. Allows for distributed inferencing in multi-node situations.", + examples=[True, False], + ) + trust_remote_code: bool = Field( + title="Trust Downloaded Model Code", + description="Whether to trust inferencing code downloaded as part of the model download." + "Please review the Python code in the .model/ directory before trusting custom model code.", + examples=[True, False], + ) class DownloadOptions(BaseConfig): - hf_hub_enable_hf_transfer: Literal["0", "1"] = Field( - description="Option (0 - Disable, 1 - Enable) for faster transfers, tradeoff stability for faster speeds" - ) repo_id: str = Field( - description="HuggingFace repo id", + description="The HuggingFace git repository ID", examples=[ - "TheBloke/Synthia-7B-v2.0-GPTQ", - "migtissera/Synthia-MoE-v3-Mixtral-8x7B", - "microsoft/phi-2", + "defenseunicorns/Hermes-2-Pro-Mistral-7B-4bit-32g", + "justinthelaw/Phi-3-mini-128k-instruct-4bit-128g", ], ) revision: str = Field( - description="The model branch to use", + description="The HuggingFace repository git branch to use", examples=["main", "gptq-4bit-64g-actorder_True"], ) +# vLLM specific runtime configuration options class AppConfig(BaseConfig): backend_options: ConfigOptions + CONFIG_SOURCES = [ + EnvSource( + allow_all=True, + prefix="VLLM_", + remap={ + "tensor_parallel_size": "backend_options.tensor_parallel_size", + "trust_remote_code": "backend_options.trust_remote_code", + "enforce_eager": "backend_options.enforce_eager", + "quantization": "backend_options.quantization", + "gpu_memory_utilization": "backend_options.gpu_memory_utilization", + "worker_use_ray": "backend_options.worker_use_ray", + "engine_use_ray": "backend_options.engine_use_ray", + "load_format": "backend_options.load_format", + }, + ) + ] + + +class DownloadConfig(BaseConfig): download_options: Optional[DownloadOptions] CONFIG_SOURCES = [ EnvSource( allow_all=True, - prefix="LAI_", + prefix="LFAI_", remap={ - "hf_hub_enable_hf_transfer": "download_options.hf_hub_enable_hf_transfer", "repo_id": "download_options.repo_id", "revision": "download_options.revision", - "quantization": "backend_options.quantization", - "tensor_parallel_size": "backend_options.tensor_parallel_size", }, ) ] diff --git a/packages/vllm/src/main.py b/packages/vllm/src/main.py index 6a530e4f0..67d36d178 100644 --- a/packages/vllm/src/main.py +++ b/packages/vllm/src/main.py @@ -1,15 +1,12 @@ import asyncio -import json import logging import os import queue import random -import sys 
import threading import time from typing import Any, Dict, AsyncGenerator -from confz import EnvSource from dotenv import load_dotenv from vllm import SamplingParams from vllm.engine.arg_utils import AsyncEngineArgs @@ -18,15 +15,8 @@ from vllm.utils import random_uuid from config import AppConfig -from leapfrogai_sdk import ( - BackendConfig, - ChatCompletionRequest, - CompletionRequest, -) -from leapfrogai_sdk.llm import ( - GenerationConfig, - LLM, -) +from leapfrogai_sdk import BackendConfig +from leapfrogai_sdk.llm import GenerationConfig, LLM load_dotenv() @@ -84,60 +74,6 @@ def remove_iterator(self, async_iterable): pass # If the iterable is not found, ignore the error -def get_backend_configs(): - # Manually load env var as ConfZ does not handle complex types (list) - stop_tokens: str | None = os.getenv("LAI_STOP_TOKENS") - if stop_tokens: - processed_stop_tokens = json.loads(stop_tokens) - else: - processed_stop_tokens = [] - del os.environ["LAI_STOP_TOKENS"] - - env_source = EnvSource( - allow_all=True, - prefix="LAI_", - remap={ - "model_source": "model.source", - "max_context_length": "max_context_length", - "stop_tokens": "stop_tokens", - "prompt_format_chat_system": "prompt_format.chat.system", - "prompt_format_chat_assistant": "prompt_format.chat.assistant", - "prompt_format_chat_user": "prompt_format.chat.user", - "prompt_format_defaults_top_p": "prompt_format.defaults.top_p", - "prompt_format_defaults_top_k": "prompt_format.defaults.top_k", - }, - ) - - BackendConfig.CONFIG_SOURCES = env_source - # Initialize an immutable config from env variables without stop_tokens list - backend_configs: BackendConfig = BackendConfig() - # Updates "processed_stop_tokens" without triggering Pydantic validation errors - backend_configs.model_copy(update={"stop_tokens": processed_stop_tokens}) - - return backend_configs - - -def get_config_from_request(request: ChatCompletionRequest | CompletionRequest): - return GenerationConfig( - max_new_tokens=request.max_new_tokens, - temperature=request.temperature, - top_k=request.top_k, - top_p=request.top_p, - do_sample=request.do_sample, - n=request.n, - stop=list(request.stop), - repetition_penalty=request.repetition_penalty, - presence_penalty=request.presence_penalty, - best_of=str(request.best_of), - logit_bias=request.logit_bias, - return_full_text=request.return_full_text, - truncate=request.truncate, - typical_p=request.typical_p, - watermark=request.watermark, - seed=request.seed, - ) - - @LLM class Model: """Implements an LLM model with concurrent output generation and management.""" @@ -152,19 +88,26 @@ def __init__(self): _thread = threading.Thread(target=asyncio.run, args=(self.iterate_outputs(),)) _thread.start() - self.backend_config = get_backend_configs() - self.model = self.backend_config.model.source + quantization = ( + None + if AppConfig().backend_options.quantization in ["", "None"] + else AppConfig().backend_options.quantization + ) + self.engine_args = AsyncEngineArgs( - engine_use_ray=True, - model=self.model, - trust_remote_code=False, - quantization=AppConfig().backend_options.quantization, - max_seq_len_to_capture=self.backend_config.max_context_length, - max_model_len=self.backend_config.max_context_length, - dtype="auto", - worker_use_ray=True, - gpu_memory_utilization=0.90, + # Taken from the LFAI SDK general LLM configuration + model=BackendConfig().model.source, + max_seq_len_to_capture=BackendConfig().max_context_length, + max_model_len=BackendConfig().max_context_length, + # Taken from the vLLM-specific 
configuration + enforce_eager=AppConfig().backend_options.enforce_eager, + quantization=quantization, + load_format=AppConfig().backend_options.load_format, tensor_parallel_size=AppConfig().backend_options.tensor_parallel_size, + engine_use_ray=AppConfig().backend_options.engine_use_ray, + worker_use_ray=AppConfig().backend_options.worker_use_ray, + gpu_memory_utilization=AppConfig().backend_options.gpu_memory_utilization, + trust_remote_code=AppConfig().backend_options.trust_remote_code, ) self.engine = AsyncLLMEngine.from_engine_args(self.engine_args) print(self.engine_args) @@ -228,18 +171,39 @@ async def create_response( """Initiate a response generation for the given prompt and configuration, adding the result to the iterator pool.""" - sampling_params = SamplingParams( - temperature=config.temperature, - # Clamp top_p value to prevent float errors - top_p=clamp(config.top_p, 0.0 + sys.float_info.epsilon, 1.0), - # Restrict top_k to valid values, -1 disables top_k - top_k=config.top_k if config.top_k >= 1 else -1, - stop=self.backend_config.stop_tokens, - max_tokens=config.max_new_tokens, - skip_special_tokens=False, - ) + # Collect LeapfrogAI SDK-defined parameters not aligned with vLLM SamplingParams + params = { + "max_tokens": getattr(config, "max_new_tokens"), + } + + # Collect LeapfrogAI SDK-defined parameters directly aligned with vLLM SamplingParams + aligned_params = [ + "temperature", + "top_p", + "top_k", + "stop", + "n", + "repetition_penalty", + "presence_penalty", + "best_of", + "logit_bias", + "return_full_text", + "truncate", + "typical_p", + "seed", + ] + + # Add only the parameters that exist in the request + # vLLM will provide defaults for the rest, if not specified + for param in aligned_params: + if param in config: + params[param] = config[param] + + # Pass the collected params to vLLM SamplingParams + sampling_params = SamplingParams(**params) + logger.info(f"Begin generation for request {request_id}") - logger.debug(f"{request_id} sampling_paramms: {sampling_params}") + logger.debug(f"{request_id} sampling_params: {sampling_params}") # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. 
@@ -284,8 +248,12 @@ async def generate( request_id ): result = "" - if not self.is_queue_empty(request_id): - result = self.delta_queue_by_id.get(request_id).get() + + # Ensure that the queue is not None and contains items before calling .get() + cur_queue = self.delta_queue_by_id.get(request_id) + if cur_queue is not None and not cur_queue.empty(): + result = cur_queue.get() + yield result logger.info(f"Finished request {request_id}") diff --git a/packages/vllm/src/model_download.py b/packages/vllm/src/model_download.py index 29f88942c..b87b6a61e 100644 --- a/packages/vllm/src/model_download.py +++ b/packages/vllm/src/model_download.py @@ -1,18 +1,17 @@ import os from huggingface_hub import snapshot_download -from config import AppConfig +from config import DownloadConfig -REPO_ID = AppConfig().download_options.repo_id -REVISION = AppConfig().download_options.revision -os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = ( - AppConfig().download_options.hf_hub_enable_hf_transfer -) +REPO_ID = DownloadConfig().download_options.repo_id +REVISION = DownloadConfig().download_options.revision + +# enable hf_transfer to max-out model download bandwidth +os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" print(f"Downloading model from {REPO_ID} at revision {REVISION}...") snapshot_download( repo_id=REPO_ID, local_dir=".model", - local_dir_use_symlinks=False, revision=REVISION, ) diff --git a/packages/vllm/values/upstream-values.yaml b/packages/vllm/values/upstream-values.yaml index 0fe581bdd..e74ebec4a 100644 --- a/packages/vllm/values/upstream-values.yaml +++ b/packages/vllm/values/upstream-values.yaml @@ -2,12 +2,55 @@ image: repository: "ghcr.io/defenseunicorns/leapfrogai/vllm" tag: "###ZARF_CONST_IMAGE_VERSION###" +nameOverride: "###ZARF_CONST_NAME_OVERRIDE###" + +leapfrogaiConfig: + model: + source: "###ZARF_CONST_MODEL_PATH###" + maxContextLength: "###ZARF_VAR_MAX_CONTEXT_LENGTH###" + stopTokens: "###ZARF_VAR_STOP_TOKENS###" + promptFormat: + chat: + system: "###ZARF_VAR_PROMPT_FORMAT_CHAT_SYSTEM###" + assistant: "###ZARF_VAR_PROMPT_FORMAT_CHAT_ASSISTANT###" + user: "###ZARF_VAR_PROMPT_FORMAT_CHAT_USER###" + defaults: + temperature: "###ZARF_VAR_TEMPERATURE###" + topP: "###ZARF_VAR_TOP_P###" + topK: "###ZARF_VAR_TOP_K###" + repetitionPenalty: "###ZARF_VAR_REPETITION_PENALTY###" + maxNewTokens: "###ZARF_VAR_MAX_NEW_TOKENS###" + + +vllmConfig: + trustRemoteCode: "###ZARF_VAR_TRUST_REMOTE_CODE###" + tensorParallelSize: "###ZARF_VAR_TENSOR_PARALLEL_SIZE###" + enforceEager: "###ZARF_VAR_ENFORCE_EAGER###" + gpuMemoryUtilization: "###ZARF_VAR_GPU_MEMORY_UTILIZATION###" + workerUseRay: "###ZARF_VAR_WORKER_USE_RAY###" + engineUseRay: "###ZARF_VAR_ENGINE_USE_RAY###" + quantization: "###ZARF_VAR_QUANTIZATION###" + loadFormat: "###ZARF_VAR_LOAD_FORMAT###" + +env: + - name: LFAI_LOG_LEVEL + value: "INFO" + gpu: runtimeClassName: "###ZARF_VAR_GPU_RUNTIME###" resources: + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 
limits: + cpu: 0 + memory: 0 nvidia.com/gpu: "###ZARF_VAR_GPU_LIMIT###" + requests: + cpu: 0 + memory: 0 persistence: size: "###ZARF_VAR_PVC_SIZE###" diff --git a/packages/vllm/zarf-config.yaml b/packages/vllm/zarf-config.yaml new file mode 100644 index 000000000..5f032eecb --- /dev/null +++ b/packages/vllm/zarf-config.yaml @@ -0,0 +1,39 @@ +package: + create: + set: + # x-release-please-start-version + image_version: "0.13.0" + # x-release-please-end + + model_repo_id: "TheBloke/Synthia-7B-v2.0-GPTQ" + model_revision: "gptq-4bit-32g-actorder_True" + model_path: "/data/.model/" + name_override: "vllm" + deploy: + set: + # vLLM runtime configuration (usually influenced by .env in local development) + trust_remote_code: "True" + tensor_parallel_size: "1" + enforce_eager: "False" + gpu_memory_utilization: "0.90" + worker_use_ray: "True" + engine_use_ray: "True" + quantization: "None" + load_format: "auto" + # LeapfrogAI SDK runtime configuration (usually influenced by config.yaml in development) + max_context_length: "32768" + stop_tokens: ", <|im_end|>, <|endoftext|>" + prompt_format_chat_system: "SYSTEM: {}\n" + prompt_format_chat_user: "USER: {}\n" + prompt_format_chat_assistant: "ASSISTANT: {}\n" + temperature: "0.1" + top_p: "1.0" + top_k: "0" + repetition_penalty: "1.0" + max_new_tokens: "8192" + # Pod deployment configuration + gpu_limit: "1" + gpu_runtime: "nvidia" + pvc_size: "15Gi" + pvc_access_mode: "ReadWriteOnce" + pvc_storage_class: "local-path" diff --git a/packages/vllm/zarf.yaml b/packages/vllm/zarf.yaml index ed88c2f18..5e1733d17 100644 --- a/packages/vllm/zarf.yaml +++ b/packages/vllm/zarf.yaml @@ -9,27 +9,86 @@ metadata: constants: - name: IMAGE_VERSION value: "###ZARF_PKG_TMPL_IMAGE_VERSION###" + - name: MODEL_REPO_ID + description: "The HuggingFace repository ID" + value: "###ZARF_PKG_TMPL_MODEL_REPO_ID###" + - name: MODEL_REVISION + description: "The HuggingFace git branch or commit hash" + value: "###ZARF_PKG_TMPL_MODEL_REVISION###" + - name: MODEL_PATH + description: "Defines the location of the Zarf Injected model files in the vLLM container" + value: "###ZARF_PKG_TMPL_MODEL_PATH###" + - name: NAME_OVERRIDE + description: "Provide an override for the name of the deployment (e.g., the model name)" + value: "###ZARF_PKG_TMPL_NAME_OVERRIDE###" variables: + # vLLM runtime configuration (usually influenced by .env in local development) + - name: TRUST_REMOTE_CODE + description: "If True, allows the execution of code within the model files directory" + pattern: "^(True|False)$" + - name: TENSOR_PARALLEL_SIZE + description: "The number of tensor parallelism splits, typically used for model parallelism across GPUs" + pattern: "^[1-9][0-9]*$" + - name: ENFORCE_EAGER + description: "If set to True, enforces eager execution mode instead of lazy execution, impacting performance" + pattern: "^(True|False)$" + - name: GPU_MEMORY_UTILIZATION + description: "The fraction of GPU memory to be utilized, expressed as a decimal value between 0.01 and 0.99" + pattern: ^0\.(0[1-9]|[1-9][0-9])$ + - name: WORKER_USE_RAY + description: "If True, uses Ray for distributed worker management" + pattern: "^(True|False)$" + - name: ENGINE_USE_RAY + description: "If True, uses Ray for managing the execution engine" + pattern: "^(True|False)$" + - name: QUANTIZATION + description: "If None, allows vLLM to automatically detect via model files and configuration" + - name: LOAD_FORMAT + description: "If auto, allows vLLM to automatically detect via model files and configuration" + # LeapfrogAI SDK 
runtime configuration (usually influenced by config.yaml in development) + - name: MAX_CONTEXT_LENGTH + description: "The maximum number of tokens the model can process in a single input before the inferencing engine's overflow strategy is used" + pattern: "^[1-9][0-9]*$" + - name: STOP_TOKENS + description: "A set of special tokens that signal the model to stop producing further output, delimited using a comma and space" + pattern: ^(<[^,]+>\s*,\s*)*<[^,]+>\s*$ + - name: PROMPT_FORMAT_CHAT_SYSTEM + description: "Prompt template format for the LeapfrogAI SDK to consume and wrap" + - name: PROMPT_FORMAT_CHAT_USER + description: "Prompt template format for the LeapfrogAI SDK to consume and wrap" + - name: PROMPT_FORMAT_CHAT_ASSISTANT + description: "Prompt template format for the LeapfrogAI SDK to consume and wrap" + - name: TEMPERATURE + description: "Controls the randomness of the model's output" + pattern: ^(0(\.\d+)?|1(\.0+)?)$ + - name: TOP_P + description: "The cumulative probability threshold for token sampling, where 1.0 represents no restriction" + pattern: ^(0(\.\d+)?|1(\.0+)?)$ + - name: TOP_K + description: "The number of top-K tokens to consider during sampling, where 0 disables top-K sampling" + pattern: ^\d+$ + - name: REPETITION_PENALTY + description: "The penalty value for repetition in generation" + pattern: ^(0(\.\d+)?|1(\.0+)?)$ + - name: MAX_NEW_TOKENS + description: "Maximum new tokens to generate" + pattern: ^\d+$ + # Pod deployment configuration - name: GPU_LIMIT - description: The GPU limit for the model inferencing. Must be 1 or more. - default: "1" + description: "The GPU limit for the model inferencing. Must be 1 or more." pattern: "^[1-9][0-9]*$" - name: GPU_RUNTIME - description: The GPU runtime name for the model inferencing. - default: "nvidia" + description: "The GPU runtime name for the model inferencing." pattern: "^(nvidia)?$" - name: PVC_SIZE - description: Size of the PVC used for model storage. - default: "15Gi" + description: "Size of the PVC used for model storage." pattern: "^[0-9]+[a-zA-Z]+$" - name: PVC_ACCESS_MODE - description: Access mode of the PVC used for model storage. - default: "ReadWriteOnce" + description: "Access mode of the PVC used for model storage." pattern: "^(ReadWriteOnce|ReadOnlyMany|ReadWriteMany)$" - name: PVC_STORAGE_CLASS - description: Storage class of the PVC used for model storage. - default: "local-path" + description: "Storage class of the PVC used for model storage." 
components: - name: vllm-model @@ -37,33 +96,33 @@ components: only: flavor: upstream charts: - - name: vllm-model + - name: "###ZARF_PKG_TMPL_NAME_OVERRIDE###-model" namespace: leapfrogai localPath: chart - releaseName: vllm-model + releaseName: "###ZARF_PKG_TMPL_NAME_OVERRIDE###-model" # x-release-please-start-version version: 0.13.1 # x-release-please-end valuesFiles: - "values/upstream-values.yaml" images: - - ghcr.io/defenseunicorns/leapfrogai/vllm:###ZARF_PKG_TMPL_IMAGE_VERSION### - - cgr.dev/chainguard/bash:latest + - "ghcr.io/defenseunicorns/leapfrogai/vllm:###ZARF_PKG_TMPL_IMAGE_VERSION###" + - "cgr.dev/chainguard/bash:latest" dataInjections: - - source: .model/ + # location where locally downloaded model files are located + - source: ".model/" target: - namespace: leapfrogai - selector: app=lfai-vllm - container: data-loader - path: /data/.model + namespace: "leapfrogai" + selector: "app=lfai-###ZARF_PKG_TMPL_NAME_OVERRIDE###" + container: "data-loader" + # location in the container for injection of the model files + path: "###ZARF_PKG_TMPL_MODEL_PATH###" compress: true actions: onCreate: before: # NOTE: This assumes python is installed and in $PATH and 'huggingface_hub[cli,hf_transfer]' has been installed - - cmd: python src/model_download.py + - cmd: "python src/model_download.py" env: - - LAI_REPO_ID=TheBloke/Synthia-7B-v2.0-GPTQ - - LAI_REVISION=gptq-4bit-32g-actorder_True - - LAI_QUANTIZATION=gptq - - LAI_HF_HUB_ENABLE_HF_TRANSFER=1 + - LFAI_REPO_ID=###ZARF_PKG_TMPL_MODEL_REPO_ID### + - LFAI_REVISION=###ZARF_PKG_TMPL_MODEL_REVISION### diff --git a/src/leapfrogai_api/backend/grpc_client.py b/src/leapfrogai_api/backend/grpc_client.py index f9082fdc2..9d18d2951 100644 --- a/src/leapfrogai_api/backend/grpc_client.py +++ b/src/leapfrogai_api/backend/grpc_client.py @@ -63,7 +63,7 @@ async def completion(model: Model, request: lfai.CompletionRequest): CompletionChoice( index=0, text=response.choices[0].text, - finish_reason=finish_reason_enum.to_string(), + finish_reason=finish_reason_enum.to_finish_reason(), logprobs=None, ) ], @@ -122,7 +122,7 @@ async def chat_completion(model: Model, request: lfai.ChatCompletionRequest): ).lower(), content=response.choices[0].chat_item.content, ), - finish_reason=finish_reason_enum.to_string(), + finish_reason=finish_reason_enum.to_finish_reason(), ) ], usage=Usage( diff --git a/src/leapfrogai_api/backend/helpers.py b/src/leapfrogai_api/backend/helpers.py index 65a2fd0b5..005111601 100644 --- a/src/leapfrogai_api/backend/helpers.py +++ b/src/leapfrogai_api/backend/helpers.py @@ -39,7 +39,7 @@ async def recv_completion( index=0, text=c.choices[0].text, logprobs=None, - finish_reason=finish_reason_enum.to_string(), + finish_reason=finish_reason_enum.to_finish_reason(), ) ], usage=Usage( @@ -77,7 +77,7 @@ async def recv_chat( delta=ChatDelta( role="assistant", content=c.choices[0].chat_item.content ), - finish_reason=finish_reason_enum.to_string(), + finish_reason=finish_reason_enum.to_finish_reason(), ) ], usage=Usage( diff --git a/src/leapfrogai_api/typedef/completion/completion_types.py b/src/leapfrogai_api/typedef/completion/completion_types.py index 9a5cdad95..f92d91f28 100644 --- a/src/leapfrogai_api/typedef/completion/completion_types.py +++ b/src/leapfrogai_api/typedef/completion/completion_types.py @@ -7,15 +7,48 @@ class FinishReason(Enum): - NONE = 0 # Maps to "None" - STOP = 1 # Maps to "stop" - LENGTH = 2 # Maps to "length" + NONE = 0 + STOP = 1 + LENGTH = 2 - def to_string(self) -> str | None: + def to_finish_reason(self) -> str | 
None: + """ + Convert the enum member to its corresponding finish reason string. + + Returns: + str | None: The finish reason as a lowercase string if it is not NONE; otherwise, None. + """ if self == FinishReason.NONE: return None return self.name.lower() + @classmethod + def _missing_(cls, value): + """ + Handle missing values when creating an enum instance. + + This method is called when a value passed to the enum constructor does not match any existing enum members. + It provides custom logic to map input values to enum members or raises an error if the value is invalid. + + Args: + value: The value that was not found among the enum members. + + Returns: + FinishReason: The corresponding enum member after applying custom mapping. + + Raises: + ValueError: If the value cannot be mapped to any enum member. + """ + # Handle custom value mappings + if value is None or value == "None": + return cls.NONE + elif value == "stop": + return cls.STOP + elif value == "length": + return cls.LENGTH + else: + raise ValueError(f"Invalid FinishReason value: {value}") + class CompletionChoice(BaseModel): """Choice object for completion."""
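As a minimal illustration of the `FinishReason` helpers added above: the `_missing_` hook lets callers construct the enum directly from the string forms ("stop", "length", "None", or a bare `None`) rather than the integer values, and `to_finish_reason()` converts a member back to the lowercase string used in completion responses. The sketch below follows directly from the enum definition in this patch; the import path simply mirrors the patched file's location and is an assumption, not something stated in the patch.

```python
# Sketch of the new FinishReason behavior, derived from the enum defined in this patch.
# The import path mirrors src/leapfrogai_api/typedef/completion/completion_types.py and
# is assumed here; the package may expose the class through a different public path.
from leapfrogai_api.typedef.completion.completion_types import FinishReason

# _missing_ maps the string forms (and None) onto enum members.
assert FinishReason("stop") is FinishReason.STOP
assert FinishReason("length") is FinishReason.LENGTH
assert FinishReason("None") is FinishReason.NONE
assert FinishReason(None) is FinishReason.NONE

# to_finish_reason() converts back to the lowercase strings, with NONE becoming None.
assert FinishReason.STOP.to_finish_reason() == "stop"
assert FinishReason.LENGTH.to_finish_reason() == "length"
assert FinishReason.NONE.to_finish_reason() is None
```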