From 81ff6a3d7a714e0920bea0f75d47189c2f47fdb9 Mon Sep 17 00:00:00 2001 From: Gregory Horvath Date: Wed, 2 Oct 2024 22:24:37 -0400 Subject: [PATCH 1/4] feat(api): add prometheus monitoring (#1166) Adds a python library for exposing basic fastapi metrics in prometheus syntax on /metrics endpoint Add a policy to block access to the /metrics from outside the cluster Adds a service monitor to api deployment so prometheus will scrape metrics Redirect base url to `/docs` to prevent not found errors. --- packages/api/chart/templates/istio-admin.yaml | 24 +++++++++++++++++++ packages/api/chart/templates/uds-package.yaml | 5 ++++ src/leapfrogai_api/main.py | 18 +++++++++++++- src/leapfrogai_api/pyproject.toml | 1 + 4 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 packages/api/chart/templates/istio-admin.yaml diff --git a/packages/api/chart/templates/istio-admin.yaml b/packages/api/chart/templates/istio-admin.yaml new file mode 100644 index 000000000..c369e8786 --- /dev/null +++ b/packages/api/chart/templates/istio-admin.yaml @@ -0,0 +1,24 @@ +{{- if .Capabilities.APIVersions.Has "security.istio.io/v1beta1" }} +apiVersion: security.istio.io/v1beta1 +kind: AuthorizationPolicy +metadata: + name: api-block-metrics-access-from-public-gateway + namespace: {{ .Release.Namespace }} +spec: + selector: + matchLabels: + {{- include "chart.selectorLabels" . | nindent 6 }} + action: DENY + rules: + - to: + - operation: + ports: + - "8080" + paths: + - /metrics* + from: + - source: + notNamespaces: + - istio-admin-gateway + - monitoring +{{- end }} diff --git a/packages/api/chart/templates/uds-package.yaml b/packages/api/chart/templates/uds-package.yaml index a6a83dea8..17220788d 100644 --- a/packages/api/chart/templates/uds-package.yaml +++ b/packages/api/chart/templates/uds-package.yaml @@ -7,6 +7,11 @@ metadata: labels: {{- include "chart.labels" . | nindent 4 }} spec: + monitor: + - portName: http + targetPort: {{ .Values.api.service.port }} + selector: + {{- include "chart.selectorLabels" . | nindent 8 }} network: expose: - service: {{ include "chart.fullname" . 
}} diff --git a/src/leapfrogai_api/main.py b/src/leapfrogai_api/main.py index f9b3682d4..108ccd51e 100644 --- a/src/leapfrogai_api/main.py +++ b/src/leapfrogai_api/main.py @@ -8,7 +8,7 @@ from fastapi import FastAPI from fastapi.exception_handlers import request_validation_exception_handler from fastapi.exceptions import RequestValidationError - +from fastapi.responses import RedirectResponse from leapfrogai_api.routers.base import router as base_router from leapfrogai_api.routers.leapfrogai import auth from leapfrogai_api.routers.leapfrogai import models as lfai_models @@ -30,6 +30,7 @@ vector_stores, ) from leapfrogai_api.utils import get_model_config +from prometheus_fastapi_instrumentator import Instrumentator logging.basicConfig( level=os.getenv("LFAI_LOG_LEVEL", logging.INFO), @@ -62,6 +63,21 @@ async def lifespan(app: FastAPI): app = FastAPI(lifespan=lifespan) +@app.get("/", include_in_schema=False) +async def root(): + """Intercepts the root path and redirects to the API documentation.""" + return RedirectResponse(url="/docs") + + +Instrumentator( + excluded_handlers=["/healthz", "/metrics"], + should_group_status_codes=False, +).instrument(app).expose( + app, + include_in_schema=False, +) + + @app.exception_handler(RequestValidationError) async def validation_exception_handler(request, exc): logger.error(f"The client sent invalid data!: {exc}") diff --git a/src/leapfrogai_api/pyproject.toml b/src/leapfrogai_api/pyproject.toml index ea9b8f7e4..4542f7922 100644 --- a/src/leapfrogai_api/pyproject.toml +++ b/src/leapfrogai_api/pyproject.toml @@ -26,6 +26,7 @@ dependencies = [ "postgrest==0.16.11", # required by supabase, bug when using previous versions "openpyxl == 3.1.5", "psutil == 6.0.0", + "prometheus-fastapi-instrumentator == 7.0.0", "rerankers[flashrank] == 0.5.3" ] requires-python = "~=3.11" From bb5e58c7ad4014d01a13986644e8fbbc8d665618 Mon Sep 17 00:00:00 2001 From: Gato <115658935+CollectiveUnicorn@users.noreply.github.com> Date: Thu, 3 Oct 2024 08:54:57 -0700 Subject: [PATCH 2/4] fix(whisper): failing gpu pod due to mangled env variable * Removes newlines that were causing the creation of an invalid environment variable entry. * Resolves issue with failing gpu pod being unable to find the libcudnn_ops_infer.so file. --- packages/whisper/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/whisper/Dockerfile b/packages/whisper/Dockerfile index b3bed054a..a5513e9fa 100644 --- a/packages/whisper/Dockerfile +++ b/packages/whisper/Dockerfile @@ -37,8 +37,8 @@ COPY --from=builder /leapfrogai/.venv/ /leapfrogai/.venv/ # set the path to the cuda 11.8 dependencies ENV LD_LIBRARY_PATH \ - /leapfrogai/.venv/lib64/python3.11/site-packages/nvidia/cublas/lib:\ - /leapfrogai/.venv/lib64/python3.11/site-packages/nvidia/cudnn/lib +/leapfrogai/.venv/lib64/python3.11/site-packages/nvidia/cublas/lib:\ +/leapfrogai/.venv/lib64/python3.11/site-packages/nvidia/cudnn/lib COPY packages/whisper/main.py . 
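The `LD_LIBRARY_PATH` change above removes the leading indentation from the continued path segments, so the variable resolves to a single colon-separated list with no stray whitespace in either entry. A minimal way to sanity-check the result from inside a built whisper image is sketched below; it assumes a Python interpreter is available in the image and is illustrative only, not part of the patch.

```python
# Illustrative sanity check, intended to be run inside the built whisper container.
# Any whitespace carried over from the previously indented continuation lines would
# show up inside the quotes of the repr() output.
import os

for entry in os.environ.get("LD_LIBRARY_PATH", "").split(":"):
    print(repr(entry))
```

Each printed entry should be a bare `/leapfrogai/.venv/...` path with nothing before the leading slash.
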
From 432b380da1355c5f0f6e9612d257a0dba0015a3c Mon Sep 17 00:00:00 2001 From: Justin Law <81255462+justinthelaw@users.noreply.github.com> Date: Thu, 3 Oct 2024 12:05:50 -0400 Subject: [PATCH 3/4] fix: nightly snapshots repository typo (#1177) --- .github/workflows/nightly-snapshot-release.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/nightly-snapshot-release.yaml b/.github/workflows/nightly-snapshot-release.yaml index 5025b6cae..da6abcdef 100644 --- a/.github/workflows/nightly-snapshot-release.yaml +++ b/.github/workflows/nightly-snapshot-release.yaml @@ -24,7 +24,7 @@ defaults: env: SNAPSHOT_VERSION: snapshot-latest - SNAPSHOT_SUB_REPOSITORY: /uds/snapshots + SNAPSHOT_SUB_REPOSITORY: /uds/snapshots/ permissions: contents: read @@ -170,6 +170,7 @@ jobs: env: ANON_KEY: ${{ steps.generate_secrets.outputs.ANON_KEY }} SERVICE_KEY: ${{ steps.generate_secrets.outputs.SERVICE_KEY }} + LEAPFROGAI_MODEL: llama-cpp-python run: | python -m pytest -vvv -s ./tests/e2e From fd3cbc4178fed7b29e474f05ab1712b1a9f70481 Mon Sep 17 00:00:00 2001 From: Justin Law <81255462+justinthelaw@users.noreply.github.com> Date: Thu, 3 Oct 2024 12:07:20 -0400 Subject: [PATCH 4/4] feat(vllm)!: upgrade vllm backend and refactor deployment (#854) ### BREAKING CHANGES: - moves all ENV specific to LeapfrogAI SDK to a ConfigMap using `volumeMount` for runtime injection and modification - in local dev, this is defined via `config.yaml` - moves all ENV specific to vLLM to a ConfigMap, using `envFrom` for runtime injection and modification - in local dev, this is defined via `.env` - `ZARF_CONFIG` is used to define create-time and deploy-time variables for (e.g., `MODEL_REPO_ID`, `ENFORCE_EAGER`) - updates Make targets and workflows with new `ZARF_CONFIG` variable - updates UDS bundles with new Zarf deployment variable overrides - allows delivery engineer's declarative definition of the backend configs and model - re-introduces LFAI SDK `config.yaml` configuration method for local development and testing - MUST upgrade API and backends together due to `FinishReason` proto change --- .github/actions/release/action.yaml | 2 +- .github/workflows/e2e-vllm.yaml | 2 +- Makefile | 4 +- bundles/dev/gpu/uds-config.yaml | 27 +++- bundles/latest/gpu/uds-config.yaml | 27 +++- docs/DEVELOPMENT.md | 8 +- packages/vllm/.env.example | 25 ++- packages/vllm/Dockerfile | 54 ++----- packages/vllm/Makefile | 25 ++- packages/vllm/README.md | 53 ++++++- packages/vllm/chart/templates/deployment.yaml | 12 +- .../templates/leapfrogai-sdk-configmap.yaml | 37 +++++ .../templates/vllm-engine-configmap.yaml | 14 ++ packages/vllm/chart/values.yaml | 29 +++- packages/vllm/config.yaml | 17 ++ packages/vllm/pyproject.toml | 2 +- packages/vllm/src/config.py | 94 +++++++++-- packages/vllm/src/main.py | 148 +++++++----------- packages/vllm/src/model_download.py | 13 +- packages/vllm/values/upstream-values.yaml | 43 +++++ packages/vllm/zarf-config.yaml | 39 +++++ packages/vllm/zarf.yaml | 107 ++++++++++--- src/leapfrogai_api/backend/grpc_client.py | 4 +- src/leapfrogai_api/backend/helpers.py | 4 +- .../typedef/completion/completion_types.py | 41 ++++- 25 files changed, 610 insertions(+), 221 deletions(-) create mode 100644 packages/vllm/chart/templates/leapfrogai-sdk-configmap.yaml create mode 100644 packages/vllm/chart/templates/vllm-engine-configmap.yaml create mode 100644 packages/vllm/config.yaml create mode 100644 packages/vllm/zarf-config.yaml diff --git a/.github/actions/release/action.yaml 
b/.github/actions/release/action.yaml index 63afd9b1d..38157f59e 100644 --- a/.github/actions/release/action.yaml +++ b/.github/actions/release/action.yaml @@ -138,7 +138,7 @@ runs: run: | docker buildx build --build-arg LOCAL_VERSION=${{ inputs.releaseTag }} -t ghcr.io/defenseunicorns/leapfrogai/vllm:${{ inputs.releaseTag }} --push -f packages/vllm/Dockerfile . - zarf package create packages/vllm --set=IMAGE_VERSION=${{ inputs.releaseTag }} --flavor upstream --confirm + ZARF_CONFIG=packages/vllm/zarf-config.yaml zarf package create packages/vllm --set=IMAGE_VERSION=${{ inputs.releaseTag }} --flavor upstream --confirm zarf package publish zarf-package-vllm-amd64-${{ inputs.releaseTag }}.tar.zst oci://ghcr.io/defenseunicorns/packages${{ inputs.subRepository }}leapfrogai diff --git a/.github/workflows/e2e-vllm.yaml b/.github/workflows/e2e-vllm.yaml index ace153006..6f89948ad 100644 --- a/.github/workflows/e2e-vllm.yaml +++ b/.github/workflows/e2e-vllm.yaml @@ -88,4 +88,4 @@ jobs: ########## - name: Build vLLM run: | - make build-vllm LOCAL_VERSION=e2e-test + make build-vllm LOCAL_VERSION=e2e-test ZARF_CONFIG=packages/vllm/zarf-config.yaml diff --git a/Makefile b/Makefile index ed74b5ccf..da9266246 100644 --- a/Makefile +++ b/Makefile @@ -123,7 +123,7 @@ build-vllm: local-registry docker-vllm ## Build the vllm container and Zarf pack docker push ${DOCKER_FLAGS} localhost:${REG_PORT}/defenseunicorns/leapfrogai/vllm:${LOCAL_VERSION} ## Build the Zarf package - uds zarf package create packages/vllm --flavor ${FLAVOR} -a ${ARCH} -o packages/vllm --registry-override=ghcr.io=localhost:${REG_PORT} --insecure --set IMAGE_VERSION=${LOCAL_VERSION} ${ZARF_FLAGS} --confirm + ZARF_CONFIG=packages/vllm/zarf-config.yaml uds zarf package create packages/vllm --flavor ${FLAVOR} -a ${ARCH} -o packages/vllm --registry-override=ghcr.io=localhost:${REG_PORT} --insecure --set IMAGE_VERSION=${LOCAL_VERSION} ${ZARF_FLAGS} --confirm docker-text-embeddings: sdk-wheel ## Build the image (and tag it for the local registry) @@ -263,7 +263,7 @@ silent-deploy-llama-cpp-python-package: silent-deploy-vllm-package: @echo "Starting VLLM deployment..." 
@mkdir -p .logs - @uds zarf package deploy packages/vllm/zarf-package-vllm-${ARCH}-${LOCAL_VERSION}.tar.zst ${ZARF_FLAGS} --confirm > .logs/deploy-vllm.log 2>&1 + @ZARF_CONFIG=packages/vllm/zarf-config.yaml uds zarf package deploy packages/vllm/zarf-package-vllm-${ARCH}-${LOCAL_VERSION}.tar.zst ${ZARF_FLAGS} --confirm > .logs/deploy-vllm.log 2>&1 @echo "VLLM deployment completed" silent-deploy-text-embeddings-package: diff --git a/bundles/dev/gpu/uds-config.yaml b/bundles/dev/gpu/uds-config.yaml index 9ad6cfdb4..1ef7a2634 100644 --- a/bundles/dev/gpu/uds-config.yaml +++ b/bundles/dev/gpu/uds-config.yaml @@ -9,8 +9,31 @@ variables: gpu_limit: 0 # runs on CPU until GPU limit is increased vllm: - gpu_limit: 1 # if <1, vllm won't work, VLLM is GPU only - #tensor_parallel_size: 1 # TODO: reintroduce when vllm changes get pulled in + trust_remote_code: "True" + tensor_parallel_size: "1" + enforce_eager: "False" + gpu_memory_utilization: "0.90" + worker_use_ray: "True" + engine_use_ray: "True" + quantization: "None" + load_format: "auto" + # LeapfrogAI SDK runtime configuration (usually influenced by config.yaml in development) + max_context_length: "32768" + stop_tokens: ", <|im_end|>, <|endoftext|>" + prompt_format_chat_system: "SYSTEM: {}\n" + prompt_format_chat_user: "USER: {}\n" + prompt_format_chat_assistant: "ASSISTANT: {}\n" + temperature: "0.1" + top_p: "1.0" + top_k: "0" + repetition_penalty: "1.0" + max_new_tokens: "8192" + # Pod deployment configuration + gpu_limit: "1" + gpu_runtime: "nvidia" + pvc_size: "15Gi" + pvc_access_mode: "ReadWriteOnce" + pvc_storage_class: "local-path" supabase: domain: "uds.dev" diff --git a/bundles/latest/gpu/uds-config.yaml b/bundles/latest/gpu/uds-config.yaml index 9ad6cfdb4..1ef7a2634 100644 --- a/bundles/latest/gpu/uds-config.yaml +++ b/bundles/latest/gpu/uds-config.yaml @@ -9,8 +9,31 @@ variables: gpu_limit: 0 # runs on CPU until GPU limit is increased vllm: - gpu_limit: 1 # if <1, vllm won't work, VLLM is GPU only - #tensor_parallel_size: 1 # TODO: reintroduce when vllm changes get pulled in + trust_remote_code: "True" + tensor_parallel_size: "1" + enforce_eager: "False" + gpu_memory_utilization: "0.90" + worker_use_ray: "True" + engine_use_ray: "True" + quantization: "None" + load_format: "auto" + # LeapfrogAI SDK runtime configuration (usually influenced by config.yaml in development) + max_context_length: "32768" + stop_tokens: ", <|im_end|>, <|endoftext|>" + prompt_format_chat_system: "SYSTEM: {}\n" + prompt_format_chat_user: "USER: {}\n" + prompt_format_chat_assistant: "ASSISTANT: {}\n" + temperature: "0.1" + top_p: "1.0" + top_k: "0" + repetition_penalty: "1.0" + max_new_tokens: "8192" + # Pod deployment configuration + gpu_limit: "1" + gpu_runtime: "nvidia" + pvc_size: "15Gi" + pvc_access_mode: "ReadWriteOnce" + pvc_storage_class: "local-path" supabase: domain: "uds.dev" diff --git a/docs/DEVELOPMENT.md b/docs/DEVELOPMENT.md index 9fefb8a7e..98343ef7f 100644 --- a/docs/DEVELOPMENT.md +++ b/docs/DEVELOPMENT.md @@ -13,20 +13,20 @@ Please first see the pre-requisites listed on the LeapfrogAI documentation websi It is **_HIGHLY RECOMMENDED_** that PyEnv be installed on your machine, and a new virtual environment is created for every new development branch. 
-Follow the installation instructions outlined in the [pyenv](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation) repository to install Python 3.11.6: +Follow the installation instructions outlined in the [pyenv](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation) repository to install Python 3.11.9: ```bash # install the correct python version - pyenv install 3.11.6 + pyenv install 3.11.9 # create a new virtual environment named "leapfrogai" - pyenv virtualenv 3.11.6 leapfrogai + pyenv virtualenv 3.11.9 leapfrogai # activate the virtual environment pyenv activate leapfrogai ``` -If your installation process completes successfully but indicates missing packages such as `sqlite3`, execute the following command to install the required packages then proceed with the reinstallation of Python 3.11.6: +If your installation process completes successfully but indicates missing packages such as `sqlite3`, execute the following command to install the required packages then proceed with the reinstallation of Python 3.11.9: ```bash sudo apt-get install build-essential zlib1g-dev libffi-dev \ diff --git a/packages/vllm/.env.example b/packages/vllm/.env.example index 1e3a00170..0a995e234 100644 --- a/packages/vllm/.env.example +++ b/packages/vllm/.env.example @@ -1,13 +1,12 @@ -export LAI_HF_HUB_ENABLE_HF_TRANSFER="1" -export LAI_REPO_ID="TheBloke/Synthia-7B-v2.0-GPTQ" -export LAI_REVISION="gptq-4bit-32g-actorder_True" -export LAI_QUANTIZATION="gptq" -export LAI_TENSOR_PARALLEL_SIZE=1 -export LAI_MODEL_SOURCE=".model/" -export LAI_MAX_CONTEXT_LENGTH=32768 -export LAI_STOP_TOKENS='["","<|endoftext|>","<|im_end|>"]' -export LAI_PROMPT_FORMAT_CHAT_SYSTEM="SYSTEM: {}\n" -export LAI_PROMPT_FORMAT_CHAT_ASSISTANT="ASSISTANT: {}\n" -export LAI_PROMPT_FORMAT_CHAT_USER="USER: {}\n" -export LAI_PROMPT_FORMAT_DEFAULTS_TOP_P=1.0 -export LAI_PROMPT_FORMAT_DEFAULTS_TOP_K=0 \ No newline at end of file +LFAI_REPO_ID="TheBloke/SynthIA-7B-v2.0-GPTQ" +LFAI_REVISION="gptq-4bit-32g-actorder_True" + +VLLM_TENSOR_PARALLEL_SIZE=1 +VLLM_TRUST_REMOTE_CODE=True +VLLM_MAX_CONTEXT_LENGTH=32768 +VLLM_ENFORCE_EAGER=False +VLLM_GPU_MEMORY_UTILIZATION=0.90 +VLLM_WORKER_USE_RAY=True +VLLM_ENGINE_USE_RAY=True +VLLM_QUANTIZATION=None +VLLM_LOAD_FORMAT=auto diff --git a/packages/vllm/Dockerfile b/packages/vllm/Dockerfile index 8676f5eda..f53088ead 100755 --- a/packages/vllm/Dockerfile +++ b/packages/vllm/Dockerfile @@ -6,8 +6,9 @@ FROM nvidia/cuda:12.2.2-devel-ubuntu22.04 AS builder # set SDK location # set the pyenv and Python versions ARG SDK_DEST=src/leapfrogai_sdk/build \ - PYTHON_VERSION=3.11.6 \ - PYENV_GIT_TAG=v2.4.8 + PYTHON_VERSION=3.11.9 \ + PYENV_GIT_TAG=v2.4.8\ + COMPONENT_DIRECTORY="packages/vllm" # use root user for deps installation and nonroot user creation USER root @@ -41,7 +42,7 @@ USER nonroot # copy-in SDK from sdk stage and vllm source code from host WORKDIR /home/leapfrogai COPY --from=sdk --chown=nonroot:nonroot /leapfrogai/${SDK_DEST} ./${SDK_DEST} -COPY --chown=nonroot:nonroot packages/vllm packages/vllm +COPY --chown=nonroot:nonroot ${COMPONENT_DIRECTORY} packages/vllm # create virtual environment for light-weight portability and minimal libraries RUN curl https://pyenv.run | bash && \ @@ -54,10 +55,10 @@ RUN curl https://pyenv.run | bash && \ ENV PYENV_ROOT="/home/nonroot/.pyenv" \ PATH="/home/nonroot/.pyenv/bin:$PATH" -# Install Python 3.11.6, set it as global, and create a venv +# Install Python, set it as global, and create a venv RUN . 
~/.bashrc && \ - PYTHON_CONFIGURE_OPTS="--enable-shared" pyenv install 3.11.6 && \ - pyenv global 3.11.6 && \ + PYTHON_CONFIGURE_OPTS="--enable-shared" pyenv install 3.11.9 && \ + pyenv global ${PYTHON_VERSION} && \ pyenv exec python -m venv .venv # set path to venv python @@ -67,26 +68,15 @@ RUN rm -f packages/vllm/build/*.whl && \ python -m pip wheel packages/vllm -w packages/vllm/build --find-links=${SDK_DEST} && \ pip install packages/vllm/build/lfai_vllm*.whl --no-index --find-links=packages/vllm/build/ +################# +# FINAL CONTAINER +################# + FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04 # set SDK location ARG SDK_DEST=src/leapfrogai_sdk/build -# model-specific arguments -ARG ARG HF_HUB_ENABLE_HF_TRANSFER="1" \ - REPO_ID="TheBloke/Synthia-7B-v2.0-GPTQ" \ - REVISION="gptq-4bit-32g-actorder_True" \ - MODEL_SOURCE="/data/.model/" \ - MAX_CONTEXT_LENGTH=32768 \ - STOP_TOKENS='[""]' \ - PROMPT_FORMAT_CHAT_SYSTEM="SYSTEM: {}\n" \ - PROMPT_FORMAT_CHAT_USER="USER: {}\n" \ - PROMPT_FORMAT_CHAT_ASSISTANT="ASSISTANT: {}\n" \ - PROMPT_FORMAT_DEFAULTS_TOP_P=1.0 \ - PROMPT_FORMAT_DEFAULTS_TOP_K=0 \ - TENSOR_PARALLEL_SIZE=1 \ - QUANTIZATION="gptq" - # setup nonroot user and permissions USER root RUN groupadd -g 65532 vglusers && \ @@ -101,24 +91,10 @@ COPY --from=sdk --chown=nonroot:nonroot /leapfrogai/${SDK_DEST} ./${SDK_DEST} COPY --from=builder --chown=nonroot:nonroot /home/leapfrogai/.venv /home/leapfrogai/.venv COPY --from=builder --chown=nonroot:nonroot /home/leapfrogai/packages/vllm/src /home/leapfrogai/packages/vllm/src # copy-in python binaries -COPY --from=builder --chown=nonroot:nonroot /home/nonroot/.pyenv/versions/3.11.6/ /home/nonroot/.pyenv/versions/3.11.6/ - -# load ARG values into env variables for pickup by confz -ENV LAI_HF_HUB_ENABLE_HF_TRANSFER=${HF_HUB_ENABLE_HF_TRANSFER} \ - LAI_REPO_ID=${REPO_ID} \ - LAI_REVISION=${REVISION} \ - LAI_MODEL_SOURCE=${MODEL_SOURCE} \ - LAI_MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH} \ - LAI_STOP_TOKENS=${STOP_TOKENS} \ - LAI_PROMPT_FORMAT_CHAT_SYSTEM=${PROMPT_FORMAT_CHAT_SYSTEM} \ - LAI_PROMPT_FORMAT_CHAT_USER=${PROMPT_FORMAT_CHAT_USER} \ - LAI_PROMPT_FORMAT_CHAT_ASSISTANT=${PROMPT_FORMAT_CHAT_ASSISTANT} \ - LAI_PROMPT_FORMAT_DEFAULTS_TOP_P=${PROMPT_FORMAT_DEFAULTS_TOP_P} \ - LAI_PROMPT_FORMAT_DEFAULTS_TOP_K=${PROMPT_FORMAT_DEFAULTS_TOP_K} \ - LAI_TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE} \ - LAI_QUANTIZATION=${QUANTIZATION} \ - # remove vLLM callback to stats server - VLLM_NO_USAGE_STATS=1 +COPY --from=builder --chown=nonroot:nonroot /home/nonroot/.pyenv/versions/${PYTHON_VERSION}/ /home/nonroot/.pyenv/versions/${PYTHON_VERSION}/ + +# remove vLLM callback to stats server +ENV VLLM_NO_USAGE_STATS=1 ENV PATH="/home/leapfrogai/.venv/bin:$PATH" diff --git a/packages/vllm/Makefile b/packages/vllm/Makefile index 98e8b29db..c764a78f2 100644 --- a/packages/vllm/Makefile +++ b/packages/vllm/Makefile @@ -1,6 +1,27 @@ +ARCH ?= amd64 +LOCAL_VERSION ?= $(shell git rev-parse --short HEAD) +DOCKER_FLAGS := + install: python -m pip install ../../src/leapfrogai_sdk python -m pip install -e ".[dev]" -dev: - python -m leapfrogai_sdk.cli --app-dir=src/ main:Model +download: + @env $$(cat .env | xargs) python src/model_download.py + +dev: download + @env $$(cat .env | xargs) python -m leapfrogai_sdk.cli --app-dir=src/ main:Model + +docker: download + docker build ${DOCKER_FLAGS} \ + --platform=linux/${ARCH} \ + --build-arg LOCAL_VERSION=${LOCAL_VERSION} \ + --build-arg COMPONENT_DIRECTORY="./" \ + -t 
ghcr.io/defenseunicorns/leapfrogai/vllm:${LOCAL_VERSION} \ + -f ./Dockerfile . + + docker run -it --rm \ + --env-file ./.env \ + -v $(PWD)/config.yaml:/home/leapfrogai/config.yaml \ + -v $(PWD)/.model:/home/leapfrogai/.model \ + ghcr.io/defenseunicorns/leapfrogai/vllm:${LOCAL_VERSION} diff --git a/packages/vllm/README.md b/packages/vllm/README.md index a55238cfd..5bc7a052f 100644 --- a/packages/vllm/README.md +++ b/packages/vllm/README.md @@ -16,13 +16,21 @@ See the LeapfrogAI documentation website for [system requirements](https://docs. The default model that comes with this backend in this repository's officially released images is a [4-bit quantization of the Synthia-7b model](https://huggingface.co/TheBloke/SynthIA-7B-v2.0-GPTQ). -You can optionally specify different models or quantization types using the following Docker build arguments: +All of the commands in this sub-section are executed within this `packages/vllm` sub-directory. -- `--build-arg HF_HUB_ENABLE_HF_TRANSFER="1"`: Enable or disable HuggingFace Hub transfer (default: 1) -- `--build-arg REPO_ID="TheBloke/Synthia-7B-v2.0-GPTQ"`: HuggingFace repository ID for the model -- `--build-arg REVISION="gptq-4bit-32g-actorder_True"`: Revision or commit hash for the model -- `--build-arg QUANTIZATION="gptq"`: Quantization type (e.g., gptq, awq, or empty for un-quantized) -- `--build-arg TENSOR_PARALLEL_SIZE="1"`: The number of gpus to spread the tensor processing across +Optionally, you can specify a different model during Zarf creation: + +```bash +uds zarf package create --confirm --set MODEL_REPO_ID=defenseunicorns/Hermes-2-Pro-Mistral-7B-4bit-32g --set MODEL_REVISION=main +``` + +If you decide to use a different model, there will likely be a need to change generation and engine runtime configurations, please see the [Zarf Package Config](./zarf-config.yaml) and the [values override file](./values/upstream-values.yaml) for details on what runtime parameters can be modified. These parameters are model-specific, and can be found in the HuggingFace model cards and/or configuration files (e.g., prompt templates). + +For example, during Zarf deployment, you can override the Zarf Package Config defaults by doing the following: + +```bash +uds zarf package deploy zarf-package-vllm-amd64-dev.tar.zst --confirm --set ENFORCE_EAGER=True +``` ### Deployment @@ -39,11 +47,26 @@ uds zarf package deploy packages/vllm/zarf-package-vllm-*-dev.tar.zst --confirm ### Local Development -To run the vllm backend locally: +In local development the [config.yaml](./config.yaml) and [.env.example](./.env.example) must be modified if the model has changed away from the default. The LeapfrogAI SDK picks up the `config.yaml` automatically, and the `.env` must be sourced into the Python environment. > [!IMPORTANT] > Execute the following commands from this sub-directory +Create a `.env` file based on the [`.env.example`](./.env.example): + +```bash +cp .env.example .env +source .env +``` + +As necessary, modify the existing [`config.yaml`](./config.yaml): + +```bash +vim config.yaml +``` + +To run the vllm backend locally: + ```bash # Install dev and runtime dependencies make install @@ -54,3 +77,19 @@ python src/model_download.py # Start the model backend make dev ``` + +#### Local Docker Container + +To run the Docker container, use the following Makefile commands. `LOCAL_VERSION` must be consistent across the two Make commands. 
+ +In the root of the LeapfrogAI repository: + +```bash +LOCAL_VERSION=dev make sdk-wheel +``` + +In the root of this vLLM sub-directory: + +```bash +LOCAL_VERSION=dev make docker +``` diff --git a/packages/vllm/chart/templates/deployment.yaml b/packages/vllm/chart/templates/deployment.yaml index 7b88cc137..3f8aa0540 100644 --- a/packages/vllm/chart/templates/deployment.yaml +++ b/packages/vllm/chart/templates/deployment.yaml @@ -36,7 +36,7 @@ spec: [ "sh", "-c", - 'while [ ! -f /data/.model/###ZARF_DATA_INJECTION_MARKER### ]; do echo "waiting for zarf data sync" && sleep 1; done; echo "we are done waiting!"', + 'while [ ! -f ###ZARF_CONST_MODEL_PATH###/###ZARF_DATA_INJECTION_MARKER### ]; do echo "waiting for zarf data sync" && sleep 1; done; echo "we are done waiting!"', ] resources: {{- toYaml .Values.modelInjectionContainer.resources | nindent 12 }} @@ -46,6 +46,9 @@ spec: - name: leapfrogai-pv-storage persistentVolumeClaim: claimName: lfai-{{ .Values.nameOverride }}-pv-claim + - name: leapfrogai-sdk-configmap + configMap: + name: "{{ .Values.nameOverride }}-sdk-configmap" securityContext: {{- toYaml .Values.podSecurityContext | nindent 8 }} containers: @@ -58,6 +61,9 @@ spec: env: {{- toYaml . | nindent 12 }} {{- end }} + envFrom: + - configMapRef: + name: "{{ .Values.nameOverride }}-engine-configmap" ports: - name: http containerPort: {{ .Values.service.port }} @@ -67,6 +73,10 @@ spec: volumeMounts: - name: leapfrogai-pv-storage mountPath: "/data" + - name: leapfrogai-sdk-configmap + mountPath: "/home/leapfrogai/config.yaml" + subPath: "config.yaml" + readOnly: true {{- with .Values.nodeSelector }} nodeSelector: {{- toYaml . | nindent 8 }} diff --git a/packages/vllm/chart/templates/leapfrogai-sdk-configmap.yaml b/packages/vllm/chart/templates/leapfrogai-sdk-configmap.yaml new file mode 100644 index 000000000..cdc08be5e --- /dev/null +++ b/packages/vllm/chart/templates/leapfrogai-sdk-configmap.yaml @@ -0,0 +1,37 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Values.nameOverride }}-sdk-configmap" + namespace: {{ .Release.Namespace | default "leapfrogai" }} +data: + config.yaml: | + model: + source: {{ .Values.leapfrogaiConfig.model.source | quote }} + max_context_length: {{ .Values.leapfrogaiConfig.maxContextLength | quote }} + stop_tokens: + {{- $stopTokens := .Values.leapfrogaiConfig.stopTokens }} + {{- range $stopToken := splitList ", " .Values.leapfrogaiConfig.stopTokens }} + - {{ printf "%s" $stopToken }} + {{- end }} + prompt_format: + {{- with .Values.leapfrogaiConfig.promptFormat.chat }} + chat: + {{- if .system }} + system: {{ .system | quote }} + {{- end }} + {{- if .assistant }} + assistant: {{ .assistant | quote }} + {{- end }} + {{- if .user }} + user: {{ .user | quote }} + {{- end }} + {{- if .function }} + function: {{ .function | quote }} + {{- end }} + {{- end }} + defaults: + temperature: {{ .Values.leapfrogaiConfig.defaults.temperature | quote }} + top_p: {{ .Values.leapfrogaiConfig.defaults.topP | quote }} + top_k: {{ .Values.leapfrogaiConfig.defaults.topK | quote }} + repetition_penalty: {{ .Values.leapfrogaiConfig.defaults.repetitionPenalty | quote }} + max_new_tokens: {{ .Values.leapfrogaiConfig.defaults.maxNewTokens | quote }} diff --git a/packages/vllm/chart/templates/vllm-engine-configmap.yaml b/packages/vllm/chart/templates/vllm-engine-configmap.yaml new file mode 100644 index 000000000..5ac82b42c --- /dev/null +++ b/packages/vllm/chart/templates/vllm-engine-configmap.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ 
.Values.nameOverride }}-engine-configmap" + namespace: {{ .Release.Namespace | default "leapfrogai" }} +data: + VLLM_TRUST_REMOTE_CODE: "{{ .Values.vllmConfig.trustRemoteCode }}" + VLLM_TENSOR_PARALLEL_SIZE: "{{ .Values.vllmConfig.tensorParallelSize }}" + VLLM_ENFORCE_EAGER: "{{ .Values.vllmConfig.enforceEager }}" + VLLM_GPU_MEMORY_UTILIZATION: "{{ .Values.vllmConfig.gpuMemoryUtilization }}" + VLLM_WORKER_USE_RAY: "{{ .Values.vllmConfig.workerUseRay }}" + VLLM_ENGINE_USE_RAY: "{{ .Values.vllmConfig.engineUseRay }}" + VLLM_QUANTIZATION: "{{ .Values.vllmConfig.quantization }}" + VLLM_LOAD_FORMAT: "{{ .Values.vllmConfig.loadFormat }}" diff --git a/packages/vllm/chart/values.yaml b/packages/vllm/chart/values.yaml index 0f7fe9911..0209a8b34 100644 --- a/packages/vllm/chart/values.yaml +++ b/packages/vllm/chart/values.yaml @@ -13,6 +13,33 @@ image: nameOverride: "vllm" fullnameOverride: "" +leapfrogaiConfig: + model: + source: "/data/.model/" + maxContextLength: "32768" + stopTokens: ", <|im_end|>, <|endoftext|>" + promptFormat: + chat: + system: "SYSTEM: {}\n" + assistant: "ASSISTANT: {}\n" + user: "USER: {}\n" + defaults: + temperature: "0.1" + topP: "1.0" + topK: "0" + repetitionPenalty: "1.0" + maxNewTokens: "8192" + +vllmConfig: + trustRemoteCode: "True" + tensorParallelSize: "1" + enforceEager: "False" + gpuMemoryUtilization: "0.90" + workerUseRay: "True" + engineUseRay: "True" + quantization: "None" + loadFormat: "auto" + env: - name: LFAI_LOG_LEVEL value: "INFO" @@ -41,7 +68,7 @@ resources: limits: cpu: 0 memory: 0 - nvidia.com/gpu: 0 + nvidia.com/gpu: 1 requests: cpu: 0 memory: 0 diff --git a/packages/vllm/config.yaml b/packages/vllm/config.yaml new file mode 100644 index 000000000..22210a74b --- /dev/null +++ b/packages/vllm/config.yaml @@ -0,0 +1,17 @@ +model: + source: ".model/" +max_context_length: 32768 +stop_tokens: + - "<|im_end|>" + - "<|endoftext|>" + - "" +prompt_format: + chat: + system: "SYSTEM: {}\n" + assistant: "ASSISTANT: {}\n" + user: "USER: {}\n" +defaults: + top_p: 1.0 + top_k: 0 + repetition_penalty: 1.0 + max_new_tokens: 8192 diff --git a/packages/vllm/pyproject.toml b/packages/vllm/pyproject.toml index 4d7955708..24b1363e6 100644 --- a/packages/vllm/pyproject.toml +++ b/packages/vllm/pyproject.toml @@ -8,7 +8,7 @@ version = "0.13.1" dependencies = [ "pydantic == 2.8.2", - "vllm == 0.4.2", + "vllm == 0.4.3", "python-dotenv == 1.0.1", "aiostream ==0.6.2", "leapfrogai-sdk", diff --git a/packages/vllm/src/config.py b/packages/vllm/src/config.py index debca4ba3..c13af5521 100644 --- a/packages/vllm/src/config.py +++ b/packages/vllm/src/config.py @@ -5,10 +5,6 @@ class ConfigOptions(BaseConfig): - quantization: Literal[None, "awq", "gptq", "squeezellm"] = Field( - default=None, - description="Type of quantization, for un-quantized models omit this field", - ) tensor_parallel_size: int = Field( default=1, title="GPU Utilization Count", @@ -16,39 +12,105 @@ class ConfigOptions(BaseConfig): "This must be divisible to the number of attention heads in the model", examples=[1, 2, 3], ) + quantization: Literal[ + "aqlm", + "bitsandbytes", + "awq", + "deepspeedfp", + "fp8", + "marlin", + "gptq_marlin_24", + "gptq_marlin", + "gptq", + "squeezellm", + "sparseml", + "None", + "", + ] = Field( + title="quantization", + description="Quantization type of the model" + "Force GPTQ instead of GPTQ_Marlin by explicitly providing `gptq` as value.", + examples=["awq", "fp8", "gptq_marlin", "gptq", "squeezellm", "None"], + ) + load_format: Literal["auto", "safetensors", "npz", "pt", 
"bitsandbytes"] = Field( + title="quantization", + description="Load format for the type model and files", + examples=["auto", "safetensors", "npz", "pt", "bitsandbytes"], + ) + enforce_eager: bool = Field( + title="Enable Eager Mode", + description="Enable eager mode to start token generation immediately after prompt processing." + "Potentially reduces initial latency at the cost of slightly higher memory usage." + "Should be set to False in production environments with higher GPU memory.", + examples=[True, False], + ) + gpu_memory_utilization: float = Field( + title="GPU Memory Limit", + description="Maximum amount of GPU vRAM allocated to the vLLM engine and worker(s)", + examples=[0.50, 0.80, 0.90], + ) + engine_use_ray: bool = Field( + title="Use Ray for Engine", + description="If True, uses Ray for managing the execution engine. Allows for distributed inferencing in multi-node situations.", + examples=[True, False], + ) + worker_use_ray: bool = Field( + title="Use Ray for Worker", + description="If True, uses Ray for distributed worker management. Allows for distributed inferencing in multi-node situations.", + examples=[True, False], + ) + trust_remote_code: bool = Field( + title="Trust Downloaded Model Code", + description="Whether to trust inferencing code downloaded as part of the model download." + "Please review the Python code in the .model/ directory before trusting custom model code.", + examples=[True, False], + ) class DownloadOptions(BaseConfig): - hf_hub_enable_hf_transfer: Literal["0", "1"] = Field( - description="Option (0 - Disable, 1 - Enable) for faster transfers, tradeoff stability for faster speeds" - ) repo_id: str = Field( - description="HuggingFace repo id", + description="The HuggingFace git repository ID", examples=[ - "TheBloke/Synthia-7B-v2.0-GPTQ", - "migtissera/Synthia-MoE-v3-Mixtral-8x7B", - "microsoft/phi-2", + "defenseunicorns/Hermes-2-Pro-Mistral-7B-4bit-32g", + "justinthelaw/Phi-3-mini-128k-instruct-4bit-128g", ], ) revision: str = Field( - description="The model branch to use", + description="The HuggingFace repository git branch to use", examples=["main", "gptq-4bit-64g-actorder_True"], ) +# vLLM specific runtime configuration options class AppConfig(BaseConfig): backend_options: ConfigOptions + CONFIG_SOURCES = [ + EnvSource( + allow_all=True, + prefix="VLLM_", + remap={ + "tensor_parallel_size": "backend_options.tensor_parallel_size", + "trust_remote_code": "backend_options.trust_remote_code", + "enforce_eager": "backend_options.enforce_eager", + "quantization": "backend_options.quantization", + "gpu_memory_utilization": "backend_options.gpu_memory_utilization", + "worker_use_ray": "backend_options.worker_use_ray", + "engine_use_ray": "backend_options.engine_use_ray", + "load_format": "backend_options.load_format", + }, + ) + ] + + +class DownloadConfig(BaseConfig): download_options: Optional[DownloadOptions] CONFIG_SOURCES = [ EnvSource( allow_all=True, - prefix="LAI_", + prefix="LFAI_", remap={ - "hf_hub_enable_hf_transfer": "download_options.hf_hub_enable_hf_transfer", "repo_id": "download_options.repo_id", "revision": "download_options.revision", - "quantization": "backend_options.quantization", - "tensor_parallel_size": "backend_options.tensor_parallel_size", }, ) ] diff --git a/packages/vllm/src/main.py b/packages/vllm/src/main.py index 6a530e4f0..67d36d178 100644 --- a/packages/vllm/src/main.py +++ b/packages/vllm/src/main.py @@ -1,15 +1,12 @@ import asyncio -import json import logging import os import queue import random -import sys 
import threading import time from typing import Any, Dict, AsyncGenerator -from confz import EnvSource from dotenv import load_dotenv from vllm import SamplingParams from vllm.engine.arg_utils import AsyncEngineArgs @@ -18,15 +15,8 @@ from vllm.utils import random_uuid from config import AppConfig -from leapfrogai_sdk import ( - BackendConfig, - ChatCompletionRequest, - CompletionRequest, -) -from leapfrogai_sdk.llm import ( - GenerationConfig, - LLM, -) +from leapfrogai_sdk import BackendConfig +from leapfrogai_sdk.llm import GenerationConfig, LLM load_dotenv() @@ -84,60 +74,6 @@ def remove_iterator(self, async_iterable): pass # If the iterable is not found, ignore the error -def get_backend_configs(): - # Manually load env var as ConfZ does not handle complex types (list) - stop_tokens: str | None = os.getenv("LAI_STOP_TOKENS") - if stop_tokens: - processed_stop_tokens = json.loads(stop_tokens) - else: - processed_stop_tokens = [] - del os.environ["LAI_STOP_TOKENS"] - - env_source = EnvSource( - allow_all=True, - prefix="LAI_", - remap={ - "model_source": "model.source", - "max_context_length": "max_context_length", - "stop_tokens": "stop_tokens", - "prompt_format_chat_system": "prompt_format.chat.system", - "prompt_format_chat_assistant": "prompt_format.chat.assistant", - "prompt_format_chat_user": "prompt_format.chat.user", - "prompt_format_defaults_top_p": "prompt_format.defaults.top_p", - "prompt_format_defaults_top_k": "prompt_format.defaults.top_k", - }, - ) - - BackendConfig.CONFIG_SOURCES = env_source - # Initialize an immutable config from env variables without stop_tokens list - backend_configs: BackendConfig = BackendConfig() - # Updates "processed_stop_tokens" without triggering Pydantic validation errors - backend_configs.model_copy(update={"stop_tokens": processed_stop_tokens}) - - return backend_configs - - -def get_config_from_request(request: ChatCompletionRequest | CompletionRequest): - return GenerationConfig( - max_new_tokens=request.max_new_tokens, - temperature=request.temperature, - top_k=request.top_k, - top_p=request.top_p, - do_sample=request.do_sample, - n=request.n, - stop=list(request.stop), - repetition_penalty=request.repetition_penalty, - presence_penalty=request.presence_penalty, - best_of=str(request.best_of), - logit_bias=request.logit_bias, - return_full_text=request.return_full_text, - truncate=request.truncate, - typical_p=request.typical_p, - watermark=request.watermark, - seed=request.seed, - ) - - @LLM class Model: """Implements an LLM model with concurrent output generation and management.""" @@ -152,19 +88,26 @@ def __init__(self): _thread = threading.Thread(target=asyncio.run, args=(self.iterate_outputs(),)) _thread.start() - self.backend_config = get_backend_configs() - self.model = self.backend_config.model.source + quantization = ( + None + if AppConfig().backend_options.quantization in ["", "None"] + else AppConfig().backend_options.quantization + ) + self.engine_args = AsyncEngineArgs( - engine_use_ray=True, - model=self.model, - trust_remote_code=False, - quantization=AppConfig().backend_options.quantization, - max_seq_len_to_capture=self.backend_config.max_context_length, - max_model_len=self.backend_config.max_context_length, - dtype="auto", - worker_use_ray=True, - gpu_memory_utilization=0.90, + # Taken from the LFAI SDK general LLM configuration + model=BackendConfig().model.source, + max_seq_len_to_capture=BackendConfig().max_context_length, + max_model_len=BackendConfig().max_context_length, + # Taken from the vLLM-specific 
configuration + enforce_eager=AppConfig().backend_options.enforce_eager, + quantization=quantization, + load_format=AppConfig().backend_options.load_format, tensor_parallel_size=AppConfig().backend_options.tensor_parallel_size, + engine_use_ray=AppConfig().backend_options.engine_use_ray, + worker_use_ray=AppConfig().backend_options.worker_use_ray, + gpu_memory_utilization=AppConfig().backend_options.gpu_memory_utilization, + trust_remote_code=AppConfig().backend_options.trust_remote_code, ) self.engine = AsyncLLMEngine.from_engine_args(self.engine_args) print(self.engine_args) @@ -228,18 +171,39 @@ async def create_response( """Initiate a response generation for the given prompt and configuration, adding the result to the iterator pool.""" - sampling_params = SamplingParams( - temperature=config.temperature, - # Clamp top_p value to prevent float errors - top_p=clamp(config.top_p, 0.0 + sys.float_info.epsilon, 1.0), - # Restrict top_k to valid values, -1 disables top_k - top_k=config.top_k if config.top_k >= 1 else -1, - stop=self.backend_config.stop_tokens, - max_tokens=config.max_new_tokens, - skip_special_tokens=False, - ) + # Collect LeapfrogAI SDK-defined parameters not aligned with vLLM SamplingParams + params = { + "max_tokens": getattr(config, "max_new_tokens"), + } + + # Collect LeapfrogAI SDK-defined parameters directly aligned with vLLM SamplingParams + aligned_params = [ + "temperature", + "top_p", + "top_k", + "stop", + "n", + "repetition_penalty", + "presence_penalty", + "best_of", + "logit_bias", + "return_full_text", + "truncate", + "typical_p", + "seed", + ] + + # Add only the parameters that exist in the request + # vLLM will provide defaults for the rest, if not specified + for param in aligned_params: + if param in config: + params[param] = config[param] + + # Pass the collected params to vLLM SamplingParams + sampling_params = SamplingParams(**params) + logger.info(f"Begin generation for request {request_id}") - logger.debug(f"{request_id} sampling_paramms: {sampling_params}") + logger.debug(f"{request_id} sampling_params: {sampling_params}") # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. 
@@ -284,8 +248,12 @@ async def generate( request_id ): result = "" - if not self.is_queue_empty(request_id): - result = self.delta_queue_by_id.get(request_id).get() + + # Ensure that the queue is not None and contains items before calling .get() + cur_queue = self.delta_queue_by_id.get(request_id) + if cur_queue is not None and not cur_queue.empty(): + result = cur_queue.get() + yield result logger.info(f"Finished request {request_id}") diff --git a/packages/vllm/src/model_download.py b/packages/vllm/src/model_download.py index 29f88942c..b87b6a61e 100644 --- a/packages/vllm/src/model_download.py +++ b/packages/vllm/src/model_download.py @@ -1,18 +1,17 @@ import os from huggingface_hub import snapshot_download -from config import AppConfig +from config import DownloadConfig -REPO_ID = AppConfig().download_options.repo_id -REVISION = AppConfig().download_options.revision -os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = ( - AppConfig().download_options.hf_hub_enable_hf_transfer -) +REPO_ID = DownloadConfig().download_options.repo_id +REVISION = DownloadConfig().download_options.revision + +# enable hf_transfer to max-out model download bandwidth +os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" print(f"Downloading model from {REPO_ID} at revision {REVISION}...") snapshot_download( repo_id=REPO_ID, local_dir=".model", - local_dir_use_symlinks=False, revision=REVISION, ) diff --git a/packages/vllm/values/upstream-values.yaml b/packages/vllm/values/upstream-values.yaml index 0fe581bdd..e74ebec4a 100644 --- a/packages/vllm/values/upstream-values.yaml +++ b/packages/vllm/values/upstream-values.yaml @@ -2,12 +2,55 @@ image: repository: "ghcr.io/defenseunicorns/leapfrogai/vllm" tag: "###ZARF_CONST_IMAGE_VERSION###" +nameOverride: "###ZARF_CONST_NAME_OVERRIDE###" + +leapfrogaiConfig: + model: + source: "###ZARF_CONST_MODEL_PATH###" + maxContextLength: "###ZARF_VAR_MAX_CONTEXT_LENGTH###" + stopTokens: "###ZARF_VAR_STOP_TOKENS###" + promptFormat: + chat: + system: "###ZARF_VAR_PROMPT_FORMAT_CHAT_SYSTEM###" + assistant: "###ZARF_VAR_PROMPT_FORMAT_CHAT_ASSISTANT###" + user: "###ZARF_VAR_PROMPT_FORMAT_CHAT_USER###" + defaults: + temperature: "###ZARF_VAR_TEMPERATURE###" + topP: "###ZARF_VAR_TOP_P###" + topK: "###ZARF_VAR_TOP_K###" + repetitionPenalty: "###ZARF_VAR_REPETITION_PENALTY###" + maxNewTokens: "###ZARF_VAR_MAX_NEW_TOKENS###" + + +vllmConfig: + trustRemoteCode: "###ZARF_VAR_TRUST_REMOTE_CODE###" + tensorParallelSize: "###ZARF_VAR_TENSOR_PARALLEL_SIZE###" + enforceEager: "###ZARF_VAR_ENFORCE_EAGER###" + gpuMemoryUtilization: "###ZARF_VAR_GPU_MEMORY_UTILIZATION###" + workerUseRay: "###ZARF_VAR_WORKER_USE_RAY###" + engineUseRay: "###ZARF_VAR_ENGINE_USE_RAY###" + quantization: "###ZARF_VAR_QUANTIZATION###" + loadFormat: "###ZARF_VAR_LOAD_FORMAT###" + +env: + - name: LFAI_LOG_LEVEL + value: "INFO" + gpu: runtimeClassName: "###ZARF_VAR_GPU_RUNTIME###" resources: + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 
limits: + cpu: 0 + memory: 0 nvidia.com/gpu: "###ZARF_VAR_GPU_LIMIT###" + requests: + cpu: 0 + memory: 0 persistence: size: "###ZARF_VAR_PVC_SIZE###" diff --git a/packages/vllm/zarf-config.yaml b/packages/vllm/zarf-config.yaml new file mode 100644 index 000000000..5f032eecb --- /dev/null +++ b/packages/vllm/zarf-config.yaml @@ -0,0 +1,39 @@ +package: + create: + set: + # x-release-please-start-version + image_version: "0.13.0" + # x-release-please-end + + model_repo_id: "TheBloke/Synthia-7B-v2.0-GPTQ" + model_revision: "gptq-4bit-32g-actorder_True" + model_path: "/data/.model/" + name_override: "vllm" + deploy: + set: + # vLLM runtime configuration (usually influenced by .env in local development) + trust_remote_code: "True" + tensor_parallel_size: "1" + enforce_eager: "False" + gpu_memory_utilization: "0.90" + worker_use_ray: "True" + engine_use_ray: "True" + quantization: "None" + load_format: "auto" + # LeapfrogAI SDK runtime configuration (usually influenced by config.yaml in development) + max_context_length: "32768" + stop_tokens: ", <|im_end|>, <|endoftext|>" + prompt_format_chat_system: "SYSTEM: {}\n" + prompt_format_chat_user: "USER: {}\n" + prompt_format_chat_assistant: "ASSISTANT: {}\n" + temperature: "0.1" + top_p: "1.0" + top_k: "0" + repetition_penalty: "1.0" + max_new_tokens: "8192" + # Pod deployment configuration + gpu_limit: "1" + gpu_runtime: "nvidia" + pvc_size: "15Gi" + pvc_access_mode: "ReadWriteOnce" + pvc_storage_class: "local-path" diff --git a/packages/vllm/zarf.yaml b/packages/vllm/zarf.yaml index ed88c2f18..5e1733d17 100644 --- a/packages/vllm/zarf.yaml +++ b/packages/vllm/zarf.yaml @@ -9,27 +9,86 @@ metadata: constants: - name: IMAGE_VERSION value: "###ZARF_PKG_TMPL_IMAGE_VERSION###" + - name: MODEL_REPO_ID + description: "The HuggingFace repository ID" + value: "###ZARF_PKG_TMPL_MODEL_REPO_ID###" + - name: MODEL_REVISION + description: "The HuggingFace git branch or commit hash" + value: "###ZARF_PKG_TMPL_MODEL_REVISION###" + - name: MODEL_PATH + description: "Defines the location of the Zarf Injected model files in the vLLM container" + value: "###ZARF_PKG_TMPL_MODEL_PATH###" + - name: NAME_OVERRIDE + description: "Provide an override for the name of the deployment (e.g., the model name)" + value: "###ZARF_PKG_TMPL_NAME_OVERRIDE###" variables: + # vLLM runtime configuration (usually influenced by .env in local development) + - name: TRUST_REMOTE_CODE + description: "If True, allows the execution of code within the model files directory" + pattern: "^(True|False)$" + - name: TENSOR_PARALLEL_SIZE + description: "The number of tensor parallelism splits, typically used for model parallelism across GPUs" + pattern: "^[1-9][0-9]*$" + - name: ENFORCE_EAGER + description: "If set to True, enforces eager execution mode instead of lazy execution, impacting performance" + pattern: "^(True|False)$" + - name: GPU_MEMORY_UTILIZATION + description: "The fraction of GPU memory to be utilized, expressed as a decimal value between 0.01 and 0.99" + pattern: ^0\.(0[1-9]|[1-9][0-9])$ + - name: WORKER_USE_RAY + description: "If True, uses Ray for distributed worker management" + pattern: "^(True|False)$" + - name: ENGINE_USE_RAY + description: "If True, uses Ray for managing the execution engine" + pattern: "^(True|False)$" + - name: QUANTIZATION + description: "If None, allows vLLM to automatically detect via model files and configuration" + - name: LOAD_FORMAT + description: "If auto, allows vLLM to automatically detect via model files and configuration" + # LeapfrogAI SDK 
runtime configuration (usually influenced by config.yaml in development) + - name: MAX_CONTEXT_LENGTH + description: "The maximum number of tokens the model can process in a single input before the inferencing engine's overflow strategy is used" + pattern: "^[1-9][0-9]*$" + - name: STOP_TOKENS + description: "A set of special tokens that signal the model to stop producing further output, delimited using a comma and space" + pattern: ^(<[^,]+>\s*,\s*)*<[^,]+>\s*$ + - name: PROMPT_FORMAT_CHAT_SYSTEM + description: "Prompt template format for the LeapfrogAI SDK to consume and wrap" + - name: PROMPT_FORMAT_CHAT_USER + description: "Prompt template format for the LeapfrogAI SDK to consume and wrap" + - name: PROMPT_FORMAT_CHAT_ASSISTANT + description: "Prompt template format for the LeapfrogAI SDK to consume and wrap" + - name: TEMPERATURE + description: "Controls the randomness of the model's output" + pattern: ^(0(\.\d+)?|1(\.0+)?)$ + - name: TOP_P + description: "The cumulative probability threshold for token sampling, where 1.0 represents no restriction" + pattern: ^(0(\.\d+)?|1(\.0+)?)$ + - name: TOP_K + description: "The number of top-K tokens to consider during sampling, where 0 disables top-K sampling" + pattern: ^\d+$ + - name: REPETITION_PENALTY + description: "The penalty value for repetition in generation" + pattern: ^(0(\.\d+)?|1(\.0+)?)$ + - name: MAX_NEW_TOKENS + description: "Maximum new tokens to generate" + pattern: ^\d+$ + # Pod deployment configuration - name: GPU_LIMIT - description: The GPU limit for the model inferencing. Must be 1 or more. - default: "1" + description: "The GPU limit for the model inferencing. Must be 1 or more." pattern: "^[1-9][0-9]*$" - name: GPU_RUNTIME - description: The GPU runtime name for the model inferencing. - default: "nvidia" + description: "The GPU runtime name for the model inferencing." pattern: "^(nvidia)?$" - name: PVC_SIZE - description: Size of the PVC used for model storage. - default: "15Gi" + description: "Size of the PVC used for model storage." pattern: "^[0-9]+[a-zA-Z]+$" - name: PVC_ACCESS_MODE - description: Access mode of the PVC used for model storage. - default: "ReadWriteOnce" + description: "Access mode of the PVC used for model storage." pattern: "^(ReadWriteOnce|ReadOnlyMany|ReadWriteMany)$" - name: PVC_STORAGE_CLASS - description: Storage class of the PVC used for model storage. - default: "local-path" + description: "Storage class of the PVC used for model storage." 
components: - name: vllm-model @@ -37,33 +96,33 @@ components: only: flavor: upstream charts: - - name: vllm-model + - name: "###ZARF_PKG_TMPL_NAME_OVERRIDE###-model" namespace: leapfrogai localPath: chart - releaseName: vllm-model + releaseName: "###ZARF_PKG_TMPL_NAME_OVERRIDE###-model" # x-release-please-start-version version: 0.13.1 # x-release-please-end valuesFiles: - "values/upstream-values.yaml" images: - - ghcr.io/defenseunicorns/leapfrogai/vllm:###ZARF_PKG_TMPL_IMAGE_VERSION### - - cgr.dev/chainguard/bash:latest + - "ghcr.io/defenseunicorns/leapfrogai/vllm:###ZARF_PKG_TMPL_IMAGE_VERSION###" + - "cgr.dev/chainguard/bash:latest" dataInjections: - - source: .model/ + # location where locally downloaded model files are located + - source: ".model/" target: - namespace: leapfrogai - selector: app=lfai-vllm - container: data-loader - path: /data/.model + namespace: "leapfrogai" + selector: "app=lfai-###ZARF_PKG_TMPL_NAME_OVERRIDE###" + container: "data-loader" + # location in the container for injection of the model files + path: "###ZARF_PKG_TMPL_MODEL_PATH###" compress: true actions: onCreate: before: # NOTE: This assumes python is installed and in $PATH and 'huggingface_hub[cli,hf_transfer]' has been installed - - cmd: python src/model_download.py + - cmd: "python src/model_download.py" env: - - LAI_REPO_ID=TheBloke/Synthia-7B-v2.0-GPTQ - - LAI_REVISION=gptq-4bit-32g-actorder_True - - LAI_QUANTIZATION=gptq - - LAI_HF_HUB_ENABLE_HF_TRANSFER=1 + - LFAI_REPO_ID=###ZARF_PKG_TMPL_MODEL_REPO_ID### + - LFAI_REVISION=###ZARF_PKG_TMPL_MODEL_REVISION### diff --git a/src/leapfrogai_api/backend/grpc_client.py b/src/leapfrogai_api/backend/grpc_client.py index f9082fdc2..9d18d2951 100644 --- a/src/leapfrogai_api/backend/grpc_client.py +++ b/src/leapfrogai_api/backend/grpc_client.py @@ -63,7 +63,7 @@ async def completion(model: Model, request: lfai.CompletionRequest): CompletionChoice( index=0, text=response.choices[0].text, - finish_reason=finish_reason_enum.to_string(), + finish_reason=finish_reason_enum.to_finish_reason(), logprobs=None, ) ], @@ -122,7 +122,7 @@ async def chat_completion(model: Model, request: lfai.ChatCompletionRequest): ).lower(), content=response.choices[0].chat_item.content, ), - finish_reason=finish_reason_enum.to_string(), + finish_reason=finish_reason_enum.to_finish_reason(), ) ], usage=Usage( diff --git a/src/leapfrogai_api/backend/helpers.py b/src/leapfrogai_api/backend/helpers.py index 65a2fd0b5..005111601 100644 --- a/src/leapfrogai_api/backend/helpers.py +++ b/src/leapfrogai_api/backend/helpers.py @@ -39,7 +39,7 @@ async def recv_completion( index=0, text=c.choices[0].text, logprobs=None, - finish_reason=finish_reason_enum.to_string(), + finish_reason=finish_reason_enum.to_finish_reason(), ) ], usage=Usage( @@ -77,7 +77,7 @@ async def recv_chat( delta=ChatDelta( role="assistant", content=c.choices[0].chat_item.content ), - finish_reason=finish_reason_enum.to_string(), + finish_reason=finish_reason_enum.to_finish_reason(), ) ], usage=Usage( diff --git a/src/leapfrogai_api/typedef/completion/completion_types.py b/src/leapfrogai_api/typedef/completion/completion_types.py index 9a5cdad95..f92d91f28 100644 --- a/src/leapfrogai_api/typedef/completion/completion_types.py +++ b/src/leapfrogai_api/typedef/completion/completion_types.py @@ -7,15 +7,48 @@ class FinishReason(Enum): - NONE = 0 # Maps to "None" - STOP = 1 # Maps to "stop" - LENGTH = 2 # Maps to "length" + NONE = 0 + STOP = 1 + LENGTH = 2 - def to_string(self) -> str | None: + def to_finish_reason(self) -> str | 
None: + """ + Convert the enum member to its corresponding finish reason string. + + Returns: + str | None: The finish reason as a lowercase string if it is not NONE; otherwise, None. + """ if self == FinishReason.NONE: return None return self.name.lower() + @classmethod + def _missing_(cls, value): + """ + Handle missing values when creating an enum instance. + + This method is called when a value passed to the enum constructor does not match any existing enum members. + It provides custom logic to map input values to enum members or raises an error if the value is invalid. + + Args: + value: The value that was not found among the enum members. + + Returns: + FinishReason: The corresponding enum member after applying custom mapping. + + Raises: + ValueError: If the value cannot be mapped to any enum member. + """ + # Handle custom value mappings + if value is None or value == "None": + return cls.NONE + elif value == "stop": + return cls.STOP + elif value == "length": + return cls.LENGTH + else: + raise ValueError(f"Invalid FinishReason value: {value}") + class CompletionChoice(BaseModel): """Choice object for completion."""
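As a minimal illustration of the `FinishReason` helpers added above: the `_missing_` hook lets callers construct the enum directly from the string forms ("stop", "length", "None", or a bare `None`) rather than the integer values, and `to_finish_reason()` converts a member back to the lowercase string used in completion responses. The sketch below follows directly from the enum definition in this patch; the import path simply mirrors the patched file's location and is an assumption, not something stated in the patch.

```python
# Sketch of the new FinishReason behavior, derived from the enum defined in this patch.
# The import path mirrors src/leapfrogai_api/typedef/completion/completion_types.py and
# is assumed here; the package may expose the class through a different public path.
from leapfrogai_api.typedef.completion.completion_types import FinishReason

# _missing_ maps the string forms (and None) onto enum members.
assert FinishReason("stop") is FinishReason.STOP
assert FinishReason("length") is FinishReason.LENGTH
assert FinishReason("None") is FinishReason.NONE
assert FinishReason(None) is FinishReason.NONE

# to_finish_reason() converts back to the lowercase strings, with NONE becoming None.
assert FinishReason.STOP.to_finish_reason() == "stop"
assert FinishReason.LENGTH.to_finish_reason() == "length"
assert FinishReason.NONE.to_finish_reason() is None
```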