From e19f892c998efa865d99bc39b37bef51316ceb06 Mon Sep 17 00:00:00 2001
From: Justin Law
Date: Mon, 16 Sep 2024 14:12:15 -0400
Subject: [PATCH] fix some env sourcing

---
 packages/vllm/.env.example  | 29 +++++++++++------------------
 packages/vllm/README.md     |  8 ++++++++
 packages/vllm/config.yaml   | 17 +++++++++++++++++
 packages/vllm/src/config.py |  3 ++-
 packages/vllm/src/main.py   |  7 -------
 packages/vllm/zarf.yaml     |  8 ++++----
 6 files changed, 42 insertions(+), 30 deletions(-)
 create mode 100644 packages/vllm/config.yaml

diff --git a/packages/vllm/.env.example b/packages/vllm/.env.example
index 4b49e6f64..f1dffafb8 100644
--- a/packages/vllm/.env.example
+++ b/packages/vllm/.env.example
@@ -1,18 +1,11 @@
-LFAI_HF_HUB_ENABLE_HF_TRANSFER="1"
-LFAI_REPO_ID="defenseunicorns/Hermes-2-Pro-Mistral-7B-4bit-32g"
-LFAI_REVISION="main"
-LFAI_TENSOR_PARALLEL_SIZE=1
-LFAI_TRUST_REMOTE_CODE=True
-LFAI_MODEL_SOURCE=".model/"
-LFAI_MAX_CONTEXT_LENGTH=32768
-LFAI_STOP_TOKENS='[""]'
-LFAI_PROMPT_FORMAT_CHAT_SYSTEM="<|im_start|>system\n{}<|im_end|>\n"
-LFAI_PROMPT_FORMAT_CHAT_USER="<|im_start|>user\n{}<|im_end|>\n"
-LFAI_PROMPT_FORMAT_CHAT_ASSISTANT="<|im_start|>assistant\n{}<|im_end|>\n"
-LFAI_PROMPT_FORMAT_DEFAULTS_TOP_P=1.0
-LFAI_PROMPT_FORMAT_DEFAULTS_TOP_K=0
-LFAI_ENFORCE_EAGER=False
-LFAI_GPU_MEMORY_UTILIZATION=0.90
-LFAI_WORKER_USE_RAY=True
-LFAI_ENGINE_USE_RAY=True
-LFAI_QUANTIZATION="None"
+LFAI_REPO_ID="TheBloke/SynthIA-7B-v2.0-GPTQ"
+LFAI_REVISION="gptq-4bit-32g-actorder_True"
+
+VLLM_TENSOR_PARALLEL_SIZE=1
+VLLM_TRUST_REMOTE_CODE=True
+VLLM_MAX_CONTEXT_LENGTH=32768
+VLLM_ENFORCE_EAGER=False
+VLLM_GPU_MEMORY_UTILIZATION=0.90
+VLLM_WORKER_USE_RAY=True
+VLLM_ENGINE_USE_RAY=True
+VLLM_QUANTIZATION="None"
diff --git a/packages/vllm/README.md b/packages/vllm/README.md
index dcf0b9c57..3f9201598 100644
--- a/packages/vllm/README.md
+++ b/packages/vllm/README.md
@@ -39,6 +39,14 @@ uds zarf package deploy packages/vllm/zarf-package-vllm-*-dev.tar.zst --confirm
 
 ### Local Development
 
+The [config.yaml](./config.yaml) and [.env.example](./.env.example) must be modified if the model differs from the default.
+
+Create a `.env` file based on the [.env.example](./.env.example):
+
+```bash
+cp .env.example .env
+```
+
 To run the vllm backend locally:
 
 > [!IMPORTANT]
diff --git a/packages/vllm/config.yaml b/packages/vllm/config.yaml
new file mode 100644
index 000000000..22210a74b
--- /dev/null
+++ b/packages/vllm/config.yaml
@@ -0,0 +1,17 @@
+model:
+  source: ".model/"
+max_context_length: 32768
+stop_tokens:
+  - "<|im_end|>"
+  - "<|endoftext|>"
+  - ""
+prompt_format:
+  chat:
+    system: "SYSTEM: {}\n"
+    assistant: "ASSISTANT: {}\n"
+    user: "USER: {}\n"
+defaults:
+  top_p: 1.0
+  top_k: 0
+  repetition_penalty: 1.0
+  max_new_tokens: 8192
diff --git a/packages/vllm/src/config.py b/packages/vllm/src/config.py
index a8ae8a089..f7a09fd69 100644
--- a/packages/vllm/src/config.py
+++ b/packages/vllm/src/config.py
@@ -74,12 +74,13 @@ class DownloadOptions(BaseConfig):
     )
 
 
+# vLLM specific runtime configuration options
 class AppConfig(BaseConfig):
     backend_options: ConfigOptions
     CONFIG_SOURCES = [
         EnvSource(
             allow_all=True,
-            prefix="LFAI_",
+            prefix="VLLM_",
             remap={
                 "tensor_parallel_size": "backend_options.tensor_parallel_size",
                 "trust_remote_code": "backend_options.trust_remote_code",
diff --git a/packages/vllm/src/main.py b/packages/vllm/src/main.py
index a42c2d431..d3afbe2e5 100644
--- a/packages/vllm/src/main.py
+++ b/packages/vllm/src/main.py
@@ -97,14 +97,7 @@ def get_backend_configs():
         allow_all=True,
         prefix="LFAI_",
         remap={
-            "model_source": "model.source",
-            "max_context_length": "max_context_length",
             "stop_tokens": "stop_tokens",
-            "prompt_format_chat_system": "prompt_format.chat.system",
-            "prompt_format_chat_assistant": "prompt_format.chat.assistant",
-            "prompt_format_chat_user": "prompt_format.chat.user",
-            "prompt_format_defaults_top_p": "prompt_format.defaults.top_p",
-            "prompt_format_defaults_top_k": "prompt_format.defaults.top_k",
         },
     )
 
diff --git a/packages/vllm/zarf.yaml b/packages/vllm/zarf.yaml
index 27280f766..b23cf84f0 100644
--- a/packages/vllm/zarf.yaml
+++ b/packages/vllm/zarf.yaml
@@ -19,7 +19,7 @@ constants:
     value: "/data/.model/"
 
 variables:
-  # vLLM runtime configuration
+  # vLLM runtime configuration (usually influenced by .env in local development)
   - name: TRUST_REMOTE_CODE
     description: "If True, allows the execution of code within the model files directory"
     default: "True"
@@ -47,7 +47,7 @@ variables:
   - name: QUANTIZATION
     description: "If None, allows vLLM to automatically detect via model files and configuration"
     default: "None"
-  # LeapfrogAI SDK runtime configuration
+  # LeapfrogAI SDK runtime configuration (usually influenced by config.yaml in development)
   - name: MAX_CONTEXT_LENGTH
     description: "The maximum number of tokens the model can process in a single input before the inferencing engine's overflow strategy is used"
     default: "32768"
@@ -132,5 +132,5 @@ components:
       # NOTE: This assumes python is installed and in $PATH and 'huggingface_hub[cli,hf_transfer]' has been installed
      - cmd: "python src/model_download.py"
        env:
-          - LFAI_REPO_ID=###ZARF_CONST_MODEL_REPO_ID###
-          - LFAI_REVISION=###ZARF_CONST_MODEL_REVISION###
+          - LFAI_REPO_ID="###ZARF_CONST_MODEL_REPO_ID###"
+          - LFAI_REVISION="###ZARF_CONST_MODEL_REVISION###"
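
A minimal sketch of how configuration is sourced after this patch: vLLM engine options come from `VLLM_`-prefixed environment variables (for example, the `.env` created above), while the LeapfrogAI SDK settings (model source, stop tokens, prompt format) now come from `config.yaml`. This assumes the `confz` library, which supplies the `BaseConfig`/`EnvSource` classes referenced in `src/config.py`; the class and field names below are illustrative, not the repo's actual code.

```python
# Hypothetical reproduction of the post-patch configuration split.
# Assumes the confz library (BaseConfig, EnvSource, FileSource).
from confz import BaseConfig, EnvSource, FileSource


class BackendOptions(BaseConfig):
    # Illustrative subset of the vLLM engine options listed in .env.example
    tensor_parallel_size: int = 1
    trust_remote_code: bool = False
    gpu_memory_utilization: float = 0.90


class EngineConfig(BaseConfig):
    """vLLM-specific options, read from VLLM_-prefixed environment variables."""

    backend_options: BackendOptions

    CONFIG_SOURCES = EnvSource(
        allow_all=True,
        prefix="VLLM_",  # was LFAI_ before this patch
        remap={
            "tensor_parallel_size": "backend_options.tensor_parallel_size",
            "trust_remote_code": "backend_options.trust_remote_code",
            "gpu_memory_utilization": "backend_options.gpu_memory_utilization",
        },
    )


class ModelSettings(BaseConfig):
    source: str  # e.g. ".model/"


class SDKConfig(BaseConfig):
    """LeapfrogAI SDK options, now carried by config.yaml instead of LFAI_* variables."""

    model: ModelSettings
    max_context_length: int
    stop_tokens: list[str]

    CONFIG_SOURCES = FileSource(file="config.yaml")


if __name__ == "__main__":
    # With the .env exported (e.g. `set -a; source .env; set +a`) and config.yaml
    # in the working directory, the two halves of the configuration resolve
    # independently of each other.
    print(EngineConfig().backend_options.gpu_memory_utilization)
    print(SDKConfig().max_context_length, SDKConfig().stop_tokens)
```

This mirrors the split described by the zarf.yaml comments: deployment values flow through Zarf variables into the environment, while model- and prompt-specific settings travel with `config.yaml`.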