Commit

fix some env sourcing
justinthelaw committed Sep 16, 2024
1 parent a7e3e88 commit e19f892
Showing 6 changed files with 42 additions and 30 deletions.
29 changes: 11 additions & 18 deletions packages/vllm/.env.example
@@ -1,18 +1,11 @@
LFAI_HF_HUB_ENABLE_HF_TRANSFER="1"
LFAI_REPO_ID="defenseunicotns/Hermes-2-Pro-Mistral-7B-4bit-32g"
LFAI_REVISION="main"
LFAI_TENSOR_PARALLEL_SIZE=1
LFAI_TRUST_REMOTE_CODE=True
LFAI_MODEL_SOURCE=".model/"
LFAI_MAX_CONTEXT_LENGTH=32768
LFAI_STOP_TOKENS='["</s>"]'
LFAI_PROMPT_FORMAT_CHAT_SYSTEM="<|im_start|>system\n{}<|im_end|>\n"
LFAI_PROMPT_FORMAT_CHAT_USER="<|im_start|>user\n{}<|im_end|>\n"
LFAI_PROMPT_FORMAT_CHAT_ASSISTANT="<|im_start|>assistant\n{}<|im_end|>\n"
LFAI_PROMPT_FORMAT_DEFAULTS_TOP_P=1.0
LFAI_PROMPT_FORMAT_DEFAULTS_TOP_K=0
LFAI_ENFORCE_EAGER=False
LFAI_GPU_MEMORY_UTILIZATION=0.90
LFAI_WORKER_USE_RAY=True
LFAI_ENGINE_USE_RAY=True
LFAI_QUANTIZATION="None"
LFAI_REPO_ID="TheBloke/SynthIA-7B-v2.0-GPTQ"
LFAI_REVISION="gptq-4bit-32g-actorder_True"

VLLM_TENSOR_PARALLEL_SIZE=1
VLLM_TRUST_REMOTE_CODE=True
VLLM_MAX_CONTEXT_LENGTH=32768
VLLM_ENFORCE_EAGER=False
VLLM_GPU_MEMORY_UTILIZATION=0.90
VLLM_WORKER_USE_RAY=True
VLLM_ENGINE_USE_RAY=True
VLLM_QUANTIZATION="None"
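The new example file splits configuration across two prefixes: LFAI_-prefixed variables drive the model download, while VLLM_-prefixed variables tune the inferencing engine. Below is a minimal sketch of reading that split, assuming python-dotenv is installed; it is illustrative only and not the package's actual loader.

```python
# Illustrative sketch only: read the split LFAI_*/VLLM_* variables from a
# local .env file (assumes the python-dotenv package is installed).
import os

from dotenv import load_dotenv

load_dotenv()  # loads .env from the current working directory into os.environ

# Download-time settings keep the LFAI_ prefix.
repo_id = os.getenv("LFAI_REPO_ID", "TheBloke/SynthIA-7B-v2.0-GPTQ")
revision = os.getenv("LFAI_REVISION", "gptq-4bit-32g-actorder_True")

# Engine runtime settings now use the VLLM_ prefix.
tensor_parallel_size = int(os.getenv("VLLM_TENSOR_PARALLEL_SIZE", "1"))
gpu_memory_utilization = float(os.getenv("VLLM_GPU_MEMORY_UTILIZATION", "0.90"))

print(repo_id, revision, tensor_parallel_size, gpu_memory_utilization)
```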
8 changes: 8 additions & 0 deletions packages/vllm/README.md
@@ -39,6 +39,14 @@ uds zarf package deploy packages/vllm/zarf-package-vllm-*-dev.tar.zst --confirm

### Local Development

Both [config.yaml](./config.yaml) and [.env.example](./.env.example) must be modified if a model other than the default is used.

Create a `.env` file based on the [.env.example](./.env.example):

```bash
cp .env.example .env
```

To run the vllm backend locally:

> [!IMPORTANT]
17 changes: 17 additions & 0 deletions packages/vllm/config.yaml
@@ -0,0 +1,17 @@
model:
  source: ".model/"
max_context_length: 32768
stop_tokens:
  - "<|im_end|>"
  - "<|endoftext|>"
  - "</s>"
prompt_format:
  chat:
    system: "SYSTEM: {}\n"
    assistant: "ASSISTANT: {}\n"
    user: "USER: {}\n"
  defaults:
    top_p: 1.0
    top_k: 0
    repetition_penalty: 1.0
    max_new_tokens: 8192
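The prompt_format.chat templates above use `{}` placeholders that are filled with each message's content. The following hypothetical helper is shown only to illustrate how such templates could be applied; the SDK's real formatting code may differ.

```python
# Hypothetical helper: fill each role's template with the message content.
CHAT_TEMPLATES = {
    "system": "SYSTEM: {}\n",
    "user": "USER: {}\n",
    "assistant": "ASSISTANT: {}\n",
}


def render_prompt(messages: list[dict[str, str]]) -> str:
    """Concatenate the filled-in templates in message order."""
    return "".join(
        CHAT_TEMPLATES[message["role"]].format(message["content"])
        for message in messages
    )


print(render_prompt([
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]))
# SYSTEM: You are a helpful assistant.
# USER: Hello!
```

The stop_tokens list then tells the engine where a completion for this prompt style should end.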
3 changes: 2 additions & 1 deletion packages/vllm/src/config.py
@@ -74,12 +74,13 @@ class DownloadOptions(BaseConfig):
    )


# vLLM specific runtime configuration options
class AppConfig(BaseConfig):
    backend_options: ConfigOptions
    CONFIG_SOURCES = [
        EnvSource(
            allow_all=True,
            prefix="LFAI_",
            prefix="VLLM_",
            remap={
                "tensor_parallel_size": "backend_options.tensor_parallel_size",
                "trust_remote_code": "backend_options.trust_remote_code",
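The functional change here is the prefix swap from LFAI_ to VLLM_; the remap still maps flat environment names onto nested fields of backend_options. A plain-Python sketch of what that prefix-plus-remap behavior amounts to is below (a hypothetical helper, not the confz implementation).

```python
# Plain-Python sketch of prefix stripping plus remapping into nested keys
# (hypothetical helper for illustration only).
import os

REMAP = {
    "tensor_parallel_size": "backend_options.tensor_parallel_size",
    "trust_remote_code": "backend_options.trust_remote_code",
}


def collect(prefix: str = "VLLM_") -> dict:
    nested: dict = {}
    for key, value in os.environ.items():
        if not key.startswith(prefix):
            continue
        flat = key[len(prefix):].lower()          # VLLM_TRUST_REMOTE_CODE -> trust_remote_code
        path = REMAP.get(flat, flat).split(".")   # -> backend_options.trust_remote_code
        node = nested
        for part in path[:-1]:
            node = node.setdefault(part, {})
        node[path[-1]] = value
    return nested


os.environ["VLLM_TRUST_REMOTE_CODE"] = "True"
print(collect())  # {'backend_options': {'trust_remote_code': 'True'}}
```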
7 changes: 0 additions & 7 deletions packages/vllm/src/main.py
@@ -97,14 +97,7 @@ def get_backend_configs():
        allow_all=True,
        prefix="LFAI_",
        remap={
            "model_source": "model.source",
            "max_context_length": "max_context_length",
            "stop_tokens": "stop_tokens",
            "prompt_format_chat_system": "prompt_format.chat.system",
            "prompt_format_chat_assistant": "prompt_format.chat.assistant",
            "prompt_format_chat_user": "prompt_format.chat.user",
            "prompt_format_defaults_top_p": "prompt_format.defaults.top_p",
            "prompt_format_defaults_top_k": "prompt_format.defaults.top_k",
        },
    )

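With these remap entries removed, the LeapfrogAI SDK settings (model source, context length, stop tokens, prompt format, defaults) are expected to come from config.yaml rather than LFAI_-prefixed environment variables. A hedged sketch of reading them, assuming PyYAML and the config.yaml shown above:

```python
# Sketch only: load the SDK-facing settings from config.yaml (assumes PyYAML).
import yaml

with open("packages/vllm/config.yaml") as config_file:
    sdk_config = yaml.safe_load(config_file)

print(sdk_config["model"]["source"])     # ".model/"
print(sdk_config["max_context_length"])  # 32768
print(sdk_config["stop_tokens"])         # ['<|im_end|>', '<|endoftext|>', '</s>']
```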
8 changes: 4 additions & 4 deletions packages/vllm/zarf.yaml
@@ -19,7 +19,7 @@ constants:
value: "/data/.model/"

variables:
# vLLM runtime configuration
# vLLM runtime configuration (usually influenced by .env in local development)
- name: TRUST_REMOTE_CODE
description: "If True, allows the execution of code within the model files directory"
default: "True"
@@ -47,7 +47,7 @@ variables:
  - name: QUANTIZATION
    description: "If None, allows vLLM to automatically detect via model files and configuration"
    default: "None"
  # LeapfrogAI SDK runtime configuration
  # LeapfrogAI SDK runtime configuration (usually influenced by config.yaml in development)
  - name: MAX_CONTEXT_LENGTH
    description: "The maximum number of tokens the model can process in a single input before the inferencing engine's overflow strategy is used"
    default: "32768"
@@ -132,5 +132,5 @@
          # NOTE: This assumes python is installed and in $PATH and 'huggingface_hub[cli,hf_transfer]' has been installed
          - cmd: "python src/model_download.py"
            env:
              - LFAI_REPO_ID=###ZARF_CONST_MODEL_REPO_ID###
              - LFAI_REVISION=###ZARF_CONST_MODEL_REVISION###
              - LFAI_REPO_ID="###ZARF_CONST_MODEL_REPO_ID###"
              - LFAI_REVISION="###ZARF_CONST_MODEL_REVISION###"
