Merge branch 'main' into renovate/actions-upload-artifact-4.x

defenseunicorns · Oct 4, 2024 · b9291a4 · b9291a4
2 parents 82c7271 + fd3cbc4
commit b9291a4
Show file tree

Hide file tree

Showing 31 changed files with 661 additions and 225 deletions.
diff --git a/.github/actions/release/action.yaml b/.github/actions/release/action.yaml
@@ -138,7 +138,7 @@ runs:
       run: |
         docker buildx build --build-arg LOCAL_VERSION=${{ inputs.releaseTag }} -t ghcr.io/defenseunicorns/leapfrogai/vllm:${{ inputs.releaseTag }} --push -f packages/vllm/Dockerfile .
 
-        zarf package create packages/vllm --set=IMAGE_VERSION=${{ inputs.releaseTag }} --flavor upstream --confirm
+        ZARF_CONFIG=packages/vllm/zarf-config.yaml zarf package create packages/vllm --set=IMAGE_VERSION=${{ inputs.releaseTag }} --flavor upstream --confirm
 
         zarf package publish zarf-package-vllm-amd64-${{ inputs.releaseTag }}.tar.zst oci://ghcr.io/defenseunicorns/packages${{ inputs.subRepository }}leapfrogai
 

diff --git a/.github/workflows/e2e-vllm.yaml b/.github/workflows/e2e-vllm.yaml
@@ -88,4 +88,4 @@ jobs:
         ##########
         - name: Build vLLM
           run: |
-            make build-vllm LOCAL_VERSION=e2e-test
+            make build-vllm LOCAL_VERSION=e2e-test ZARF_CONFIG=packages/vllm/zarf-config.yaml
diff --git a/.github/workflows/nightly-snapshot-release.yaml b/.github/workflows/nightly-snapshot-release.yaml
@@ -24,7 +24,7 @@ defaults:
 
 env:
   SNAPSHOT_VERSION: snapshot-latest
-  SNAPSHOT_SUB_REPOSITORY: /uds/snapshots
+  SNAPSHOT_SUB_REPOSITORY: /uds/snapshots/
 
 permissions:
   contents: read
@@ -170,6 +170,7 @@ jobs:
         env:
           ANON_KEY: ${{ steps.generate_secrets.outputs.ANON_KEY }}
           SERVICE_KEY: ${{ steps.generate_secrets.outputs.SERVICE_KEY }}
+          LEAPFROGAI_MODEL: llama-cpp-python
         run: |
           python -m pytest -vvv -s ./tests/e2e
 

diff --git a/Makefile b/Makefile
@@ -123,7 +123,7 @@ build-vllm: local-registry docker-vllm ## Build the vllm container and Zarf pack
 	docker push ${DOCKER_FLAGS} localhost:${REG_PORT}/defenseunicorns/leapfrogai/vllm:${LOCAL_VERSION}
 
 	## Build the Zarf package
-	uds zarf package create packages/vllm --flavor ${FLAVOR} -a ${ARCH} -o packages/vllm --registry-override=ghcr.io=localhost:${REG_PORT} --insecure --set IMAGE_VERSION=${LOCAL_VERSION} ${ZARF_FLAGS} --confirm
+	ZARF_CONFIG=packages/vllm/zarf-config.yaml uds zarf package create packages/vllm --flavor ${FLAVOR} -a ${ARCH} -o packages/vllm --registry-override=ghcr.io=localhost:${REG_PORT} --insecure --set IMAGE_VERSION=${LOCAL_VERSION} ${ZARF_FLAGS} --confirm
 
 docker-text-embeddings: sdk-wheel
 	## Build the image (and tag it for the local registry)
@@ -263,7 +263,7 @@ silent-deploy-llama-cpp-python-package:
 silent-deploy-vllm-package:
 	@echo "Starting VLLM deployment..."
 	@mkdir -p .logs
-	@uds zarf package deploy packages/vllm/zarf-package-vllm-${ARCH}-${LOCAL_VERSION}.tar.zst ${ZARF_FLAGS} --confirm > .logs/deploy-vllm.log 2>&1
+	@ZARF_CONFIG=packages/vllm/zarf-config.yaml uds zarf package deploy packages/vllm/zarf-package-vllm-${ARCH}-${LOCAL_VERSION}.tar.zst ${ZARF_FLAGS} --confirm > .logs/deploy-vllm.log 2>&1
 	@echo "VLLM deployment completed"
 
 silent-deploy-text-embeddings-package:

diff --git a/bundles/dev/gpu/uds-config.yaml b/bundles/dev/gpu/uds-config.yaml
@@ -9,8 +9,31 @@ variables:
     gpu_limit: 0  # runs on CPU until GPU limit is increased
 
   vllm:
-    gpu_limit: 1 # if <1, vllm won't work, VLLM is GPU only
-    #tensor_parallel_size: 1   # TODO: reintroduce when vllm changes get pulled in
+    trust_remote_code: "True"
+    tensor_parallel_size: "1"
+    enforce_eager: "False"
+    gpu_memory_utilization: "0.90"
+    worker_use_ray: "True"
+    engine_use_ray: "True"
+    quantization: "None"
+    load_format: "auto"
+    # LeapfrogAI SDK runtime configuration (usually influenced by config.yaml in development)
+    max_context_length: "32768"
+    stop_tokens: "</s>, <|im_end|>, <|endoftext|>"
+    prompt_format_chat_system: "SYSTEM: {}\n"
+    prompt_format_chat_user: "USER: {}\n"
+    prompt_format_chat_assistant: "ASSISTANT: {}\n"
+    temperature: "0.1"
+    top_p: "1.0"
+    top_k: "0"
+    repetition_penalty: "1.0"
+    max_new_tokens: "8192"
+    # Pod deployment configuration
+    gpu_limit: "1"
+    gpu_runtime: "nvidia"
+    pvc_size: "15Gi"
+    pvc_access_mode: "ReadWriteOnce"
+    pvc_storage_class: "local-path"
 
   supabase:
     domain: "uds.dev"

diff --git a/bundles/latest/gpu/uds-config.yaml b/bundles/latest/gpu/uds-config.yaml
@@ -9,8 +9,31 @@ variables:
     gpu_limit: 0  # runs on CPU until GPU limit is increased
 
   vllm:
-    gpu_limit: 1 # if <1, vllm won't work, VLLM is GPU only
-    #tensor_parallel_size: 1   # TODO: reintroduce when vllm changes get pulled in
+    trust_remote_code: "True"
+    tensor_parallel_size: "1"
+    enforce_eager: "False"
+    gpu_memory_utilization: "0.90"
+    worker_use_ray: "True"
+    engine_use_ray: "True"
+    quantization: "None"
+    load_format: "auto"
+    # LeapfrogAI SDK runtime configuration (usually influenced by config.yaml in development)
+    max_context_length: "32768"
+    stop_tokens: "</s>, <|im_end|>, <|endoftext|>"
+    prompt_format_chat_system: "SYSTEM: {}\n"
+    prompt_format_chat_user: "USER: {}\n"
+    prompt_format_chat_assistant: "ASSISTANT: {}\n"
+    temperature: "0.1"
+    top_p: "1.0"
+    top_k: "0"
+    repetition_penalty: "1.0"
+    max_new_tokens: "8192"
+    # Pod deployment configuration
+    gpu_limit: "1"
+    gpu_runtime: "nvidia"
+    pvc_size: "15Gi"
+    pvc_access_mode: "ReadWriteOnce"
+    pvc_storage_class: "local-path"
 
   supabase:
     domain: "uds.dev"

diff --git a/docs/DEVELOPMENT.md b/docs/DEVELOPMENT.md
@@ -13,20 +13,20 @@ Please first see the pre-requisites listed on the LeapfrogAI documentation websi
 
 It is **_HIGHLY RECOMMENDED_** that PyEnv be installed on your machine, and a new virtual environment is created for every new development branch.
 
-Follow the installation instructions outlined in the [pyenv](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation) repository to install Python 3.11.6:
+Follow the installation instructions outlined in the [pyenv](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation) repository to install Python 3.11.9:
 
   ```bash
   # install the correct python version
-  pyenv install 3.11.6
+  pyenv install 3.11.9
 
   # create a new virtual environment named "leapfrogai"
-  pyenv virtualenv 3.11.6 leapfrogai
+  pyenv virtualenv 3.11.9 leapfrogai
 
   # activate the virtual environment
   pyenv activate leapfrogai
   ```
 
-If your installation process completes successfully but indicates missing packages such as `sqlite3`, execute the following command to install the required packages then proceed with the reinstallation of Python 3.11.6:
+If your installation process completes successfully but indicates missing packages such as `sqlite3`, execute the following command to install the required packages then proceed with the reinstallation of Python 3.11.9:
 
   ```bash
   sudo apt-get install build-essential zlib1g-dev libffi-dev \

diff --git a/packages/api/chart/templates/istio-admin.yaml b/packages/api/chart/templates/istio-admin.yaml
@@ -0,0 +1,24 @@
+{{- if .Capabilities.APIVersions.Has "security.istio.io/v1beta1" }}
+apiVersion: security.istio.io/v1beta1
+kind: AuthorizationPolicy
+metadata:
+  name: api-block-metrics-access-from-public-gateway
+  namespace: {{ .Release.Namespace }}
+spec:
+  selector:
+    matchLabels:
+      {{- include "chart.selectorLabels" . | nindent 6 }}
+  action: DENY
+  rules:
+    - to:
+        - operation:
+            ports:
+              - "8080"
+            paths:
+            - /metrics*
+      from:
+        - source:
+            notNamespaces:
+            - istio-admin-gateway
+            - monitoring
+{{- end }}
diff --git a/packages/api/chart/templates/uds-package.yaml b/packages/api/chart/templates/uds-package.yaml
@@ -7,6 +7,11 @@ metadata:
   labels:
     {{- include "chart.labels" . | nindent 4 }}
 spec:
+  monitor:
+    - portName: http
+      targetPort: {{ .Values.api.service.port }}
+      selector:
+        {{- include "chart.selectorLabels" . | nindent 8 }}
   network:
     expose:
       - service: {{ include "chart.fullname" . }}

diff --git a/packages/vllm/.env.example b/packages/vllm/.env.example
@@ -1,13 +1,12 @@
-export LAI_HF_HUB_ENABLE_HF_TRANSFER="1"
-export LAI_REPO_ID="TheBloke/Synthia-7B-v2.0-GPTQ"
-export LAI_REVISION="gptq-4bit-32g-actorder_True"
-export LAI_QUANTIZATION="gptq"
-export LAI_TENSOR_PARALLEL_SIZE=1
-export LAI_MODEL_SOURCE=".model/"
-export LAI_MAX_CONTEXT_LENGTH=32768
-export LAI_STOP_TOKENS='["</s>","<|endoftext|>","<|im_end|>"]'
-export LAI_PROMPT_FORMAT_CHAT_SYSTEM="SYSTEM: {}\n"
-export LAI_PROMPT_FORMAT_CHAT_ASSISTANT="ASSISTANT: {}\n"
-export LAI_PROMPT_FORMAT_CHAT_USER="USER: {}\n"
-export LAI_PROMPT_FORMAT_DEFAULTS_TOP_P=1.0
-export LAI_PROMPT_FORMAT_DEFAULTS_TOP_K=0
+LFAI_REPO_ID=TheBloke/SynthIA-7B-v2.0-GPTQ
+LFAI_REVISION=gptq-4bit-32g-actorder_True
+
+VLLM_TENSOR_PARALLEL_SIZE=1
+VLLM_TRUST_REMOTE_CODE=True
+VLLM_MAX_CONTEXT_LENGTH=32768
+VLLM_ENFORCE_EAGER=False
+VLLM_GPU_MEMORY_UTILIZATION=0.90
+VLLM_WORKER_USE_RAY=True
+VLLM_ENGINE_USE_RAY=True
+VLLM_QUANTIZATION=None
+VLLM_LOAD_FORMAT=auto
diff --git a/packages/vllm/Dockerfile b/packages/vllm/Dockerfile
@@ -6,8 +6,9 @@ FROM nvidia/cuda:12.2.2-devel-ubuntu22.04 AS builder
 # set SDK location
 # set the pyenv and Python versions
 ARG SDK_DEST=src/leapfrogai_sdk/build \
-    PYTHON_VERSION=3.11.6 \
-    PYENV_GIT_TAG=v2.4.8
+    PYTHON_VERSION=3.11.9 \
+    PYENV_GIT_TAG=v2.4.8\
+    COMPONENT_DIRECTORY="packages/vllm"
 
 # use root user for deps installation and nonroot user creation
 USER root
@@ -41,7 +42,7 @@ USER nonroot
 # copy-in SDK from sdk stage and vllm source code from host
 WORKDIR /home/leapfrogai
 COPY --from=sdk --chown=nonroot:nonroot /leapfrogai/${SDK_DEST} ./${SDK_DEST}
-COPY --chown=nonroot:nonroot packages/vllm packages/vllm
+COPY --chown=nonroot:nonroot ${COMPONENT_DIRECTORY} packages/vllm
 
 # create virtual environment for light-weight portability and minimal libraries
 RUN curl https://pyenv.run | bash && \
@@ -54,10 +55,10 @@ RUN curl https://pyenv.run | bash && \
 ENV PYENV_ROOT="/home/nonroot/.pyenv" \
     PATH="/home/nonroot/.pyenv/bin:$PATH"
 
-# Install Python 3.11.6, set it as global, and create a venv
+# Install the Python release named by the PYTHON_VERSION build arg, set it as global, and create a venv
 RUN . ~/.bashrc && \
-    PYTHON_CONFIGURE_OPTS="--enable-shared" pyenv install 3.11.6 && \
-    pyenv global 3.11.6 && \
+    PYTHON_CONFIGURE_OPTS="--enable-shared" pyenv install ${PYTHON_VERSION} && \
+    pyenv global ${PYTHON_VERSION} && \
     pyenv exec python -m venv .venv
 
 # set path to venv python
@@ -67,26 +68,15 @@ RUN rm -f packages/vllm/build/*.whl && \
     python -m pip wheel packages/vllm -w packages/vllm/build --find-links=${SDK_DEST} && \
     pip install packages/vllm/build/lfai_vllm*.whl --no-index --find-links=packages/vllm/build/
 
+#################
+# FINAL CONTAINER
+#################
+
 FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04
 
 # set SDK location
 ARG SDK_DEST=src/leapfrogai_sdk/build
 
-# model-specific arguments
-ARG ARG HF_HUB_ENABLE_HF_TRANSFER="1" \
-    REPO_ID="TheBloke/Synthia-7B-v2.0-GPTQ" \
-    REVISION="gptq-4bit-32g-actorder_True" \
-    MODEL_SOURCE="/data/.model/" \
-    MAX_CONTEXT_LENGTH=32768 \
-    STOP_TOKENS='["</s>"]' \
-    PROMPT_FORMAT_CHAT_SYSTEM="SYSTEM: {}\n" \
-    PROMPT_FORMAT_CHAT_USER="USER: {}\n" \
-    PROMPT_FORMAT_CHAT_ASSISTANT="ASSISTANT: {}\n" \
-    PROMPT_FORMAT_DEFAULTS_TOP_P=1.0 \
-    PROMPT_FORMAT_DEFAULTS_TOP_K=0 \
-    TENSOR_PARALLEL_SIZE=1 \
-    QUANTIZATION="gptq"
-
 # setup nonroot user and permissions
 USER root
 RUN groupadd -g 65532 vglusers && \
@@ -101,24 +91,10 @@ COPY --from=sdk --chown=nonroot:nonroot /leapfrogai/${SDK_DEST} ./${SDK_DEST}
 COPY --from=builder --chown=nonroot:nonroot /home/leapfrogai/.venv /home/leapfrogai/.venv
 COPY --from=builder --chown=nonroot:nonroot /home/leapfrogai/packages/vllm/src /home/leapfrogai/packages/vllm/src
 # copy-in python binaries
-COPY --from=builder --chown=nonroot:nonroot /home/nonroot/.pyenv/versions/3.11.6/ /home/nonroot/.pyenv/versions/3.11.6/
-
-# load ARG values into env variables for pickup by confz
-ENV LAI_HF_HUB_ENABLE_HF_TRANSFER=${HF_HUB_ENABLE_HF_TRANSFER} \
-    LAI_REPO_ID=${REPO_ID} \
-    LAI_REVISION=${REVISION} \
-    LAI_MODEL_SOURCE=${MODEL_SOURCE} \
-    LAI_MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH} \
-    LAI_STOP_TOKENS=${STOP_TOKENS} \
-    LAI_PROMPT_FORMAT_CHAT_SYSTEM=${PROMPT_FORMAT_CHAT_SYSTEM} \
-    LAI_PROMPT_FORMAT_CHAT_USER=${PROMPT_FORMAT_CHAT_USER} \
-    LAI_PROMPT_FORMAT_CHAT_ASSISTANT=${PROMPT_FORMAT_CHAT_ASSISTANT} \
-    LAI_PROMPT_FORMAT_DEFAULTS_TOP_P=${PROMPT_FORMAT_DEFAULTS_TOP_P} \
-    LAI_PROMPT_FORMAT_DEFAULTS_TOP_K=${PROMPT_FORMAT_DEFAULTS_TOP_K} \
-    LAI_TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE} \
-    LAI_QUANTIZATION=${QUANTIZATION} \
-    # remove vLLM callback to stats server
-    VLLM_NO_USAGE_STATS=1
+# ARG values are scoped to the build stage that declares them, so the builder
+# stage's PYTHON_VERSION must be re-declared here; without it the paths below
+# expand to ".../versions//".
+# NOTE(review): no ARG PYTHON_VERSION is visible in this stage in the diff -
+# drop this line if one already exists above. Keep the default in sync with the builder stage.
+ARG PYTHON_VERSION=3.11.9
+COPY --from=builder --chown=nonroot:nonroot /home/nonroot/.pyenv/versions/${PYTHON_VERSION}/ /home/nonroot/.pyenv/versions/${PYTHON_VERSION}/
+
+# remove vLLM callback to stats server
+ENV VLLM_NO_USAGE_STATS=1
 
 ENV PATH="/home/leapfrogai/.venv/bin:$PATH"
 

diff --git a/packages/vllm/Makefile b/packages/vllm/Makefile
@@ -1,6 +1,27 @@
+ARCH ?= amd64
+LOCAL_VERSION ?= $(shell git rev-parse --short HEAD)
+DOCKER_FLAGS :=
+
+# None of these targets creates a file with its own name; declaring them phony
+# keeps a stray file or directory (e.g. "docker") from masking the recipe.
+.PHONY: install download dev docker
+
 install:
 	python -m pip install ../../src/leapfrogai_sdk
 	python -m pip install -e ".[dev]"
 
-dev:
-	python -m leapfrogai_sdk.cli --app-dir=src/ main:Model
+# Run src/model_download.py with the KEY=VALUE pairs from .env in its environment.
+download:
+	@env $$(cat .env | xargs) python src/model_download.py
+
+# Launch leapfrogai_sdk.cli (app dir src/, main:Model) with the .env pairs in
+# its environment, after the download target has run.
+dev: download
+	@env $$(cat .env | xargs) python -m leapfrogai_sdk.cli --app-dir=src/ main:Model
+
+# Build the image using this directory as the build context, then run it
+# interactively with .env, config.yaml and .model supplied from this directory.
+docker: download
+	docker build ${DOCKER_FLAGS} \
+		--platform=linux/${ARCH} \
+		--build-arg LOCAL_VERSION=${LOCAL_VERSION} \
+		--build-arg COMPONENT_DIRECTORY="./" \
+		-t ghcr.io/defenseunicorns/leapfrogai/vllm:${LOCAL_VERSION} \
+		-f ./Dockerfile .
+
+	# $(CURDIR) is make's own working directory; $(PWD) is only the inherited
+	# shell variable and still points at the caller's directory under `make -C`.
+	docker run -it --rm \
+		--env-file ./.env \
+		-v $(CURDIR)/config.yaml:/home/leapfrogai/config.yaml \
+		-v $(CURDIR)/.model:/home/leapfrogai/.model \
+		ghcr.io/defenseunicorns/leapfrogai/vllm:${LOCAL_VERSION}