configmap and default generation params
justinthelaw committed Sep 16, 2024
1 parent 2254b79 commit 962dd4d
Showing 12 changed files with 275 additions and 92 deletions.
36 changes: 18 additions & 18 deletions packages/vllm/.env.example
@@ -1,18 +1,18 @@
LAI_HF_HUB_ENABLE_HF_TRANSFER="1"
LAI_REPO_ID="justinthelaw/Hermes-2-Pro-Mistral-7B-4bit-32g"
LAI_REVISION="main"
LAI_TENSOR_PARALLEL_SIZE=1
LAI_TRUST_REMOTE_CODE=True
LAI_MODEL_SOURCE=".model/"
LAI_MAX_CONTEXT_LENGTH=32768
LAI_STOP_TOKENS='["</s>"]'
LAI_PROMPT_FORMAT_CHAT_SYSTEM="<|im_start|>system\n{}<|im_end|>\n"
LAI_PROMPT_FORMAT_CHAT_USER="<|im_start|>user\n{}<|im_end|>\n"
LAI_PROMPT_FORMAT_CHAT_ASSISTANT="<|im_start|>assistant\n{}<|im_end|>\n"
LAI_PROMPT_FORMAT_DEFAULTS_TOP_P=1.0
LAI_PROMPT_FORMAT_DEFAULTS_TOP_K=0
LAI_ENFORCE_EAGER=False
LAI_GPU_MEMORY_UTILIZATION=0.90
LAI_WORKER_USE_RAY=True
LAI_ENGINE_USE_RAY=True
LAI_QUANTIZATION="None"
LFAI_HF_HUB_ENABLE_HF_TRANSFER="1"
LFAI_REPO_ID="defenseunicorns/Hermes-2-Pro-Mistral-7B-4bit-32g"
LFAI_REVISION="main"
LFAI_TENSOR_PARALLEL_SIZE=1
LFAI_TRUST_REMOTE_CODE=True
LFAI_MODEL_SOURCE=".model/"
LFAI_MAX_CONTEXT_LENGTH=32768
LFAI_STOP_TOKENS='["</s>"]'
LFAI_PROMPT_FORMAT_CHAT_SYSTEM="<|im_start|>system\n{}<|im_end|>\n"
LFAI_PROMPT_FORMAT_CHAT_USER="<|im_start|>user\n{}<|im_end|>\n"
LFAI_PROMPT_FORMAT_CHAT_ASSISTANT="<|im_start|>assistant\n{}<|im_end|>\n"
LFAI_PROMPT_FORMAT_DEFAULTS_TOP_P=1.0
LFAI_PROMPT_FORMAT_DEFAULTS_TOP_K=0
LFAI_ENFORCE_EAGER=False
LFAI_GPU_MEMORY_UTILIZATION=0.90
LFAI_WORKER_USE_RAY=True
LFAI_ENGINE_USE_RAY=True
LFAI_QUANTIZATION="None"
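
Note that LFAI_STOP_TOKENS is a JSON-encoded list rather than a comma-separated string. A minimal standalone sketch (not part of this commit) of how that value can be decoded, mirroring the json.loads handling later in packages/vllm/src/main.py:

import json
import os

# Assumes the .env values above have been exported into the environment.
raw_stop_tokens = os.getenv("LFAI_STOP_TOKENS", "[]")

# The value is a JSON list, e.g. '["</s>"]', so it must be decoded
# rather than split on commas.
stop_tokens = json.loads(raw_stop_tokens)
print(stop_tokens)  # ['</s>']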
37 changes: 2 additions & 35 deletions packages/vllm/Dockerfile
@@ -77,23 +77,6 @@ FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04
# set SDK location
ARG SDK_DEST=src/leapfrogai_sdk/build

# model-specific arguments
ARG TRUST_REMOTE_CODE="True" \
MODEL_SOURCE="/data/.model/" \
MAX_CONTEXT_LENGTH=32768 \
STOP_TOKENS='["</s>"]' \
PROMPT_FORMAT_CHAT_SYSTEM="<|im_start|>system\n{}<|im_end|>\n" \
PROMPT_FORMAT_CHAT_USER="<|im_start|>user\n{}<|im_end|>\n" \
PROMPT_FORMAT_CHAT_ASSISTANT="<|im_start|>assistant\n{}<|im_end|>\n" \
PROMPT_FORMAT_DEFAULTS_TOP_P=1.0 \
PROMPT_FORMAT_DEFAULTS_TOP_K=0 \
TENSOR_PARALLEL_SIZE=1 \
ENFORCE_EAGER=False \
QUANTIZATION="None" \
GPU_MEMORY_UTILIZATION=0.90 \
WORKER_USE_RAY=True \
ENGINE_USE_RAY=True

# setup nonroot user and permissions
USER root
RUN groupadd -g 65532 vglusers && \
@@ -110,24 +93,8 @@ COPY --from=builder --chown=nonroot:nonroot /home/leapfrogai/packages/vllm/src /
# copy-in python binaries
COPY --from=builder --chown=nonroot:nonroot /home/nonroot/.pyenv/versions/3.11.6/ /home/nonroot/.pyenv/versions/3.11.6/

# load ARG values into env variables for pickup by confz
ENV LAI_TRUST_REMOTE_CODE=${TRUST_REMOTE_CODE} \
LAI_MODEL_SOURCE=${MODEL_SOURCE} \
LAI_MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH} \
LAI_STOP_TOKENS=${STOP_TOKENS} \
LAI_PROMPT_FORMAT_CHAT_SYSTEM=${PROMPT_FORMAT_CHAT_SYSTEM} \
LAI_PROMPT_FORMAT_CHAT_USER=${PROMPT_FORMAT_CHAT_USER} \
LAI_PROMPT_FORMAT_CHAT_ASSISTANT=${PROMPT_FORMAT_CHAT_ASSISTANT} \
LAI_PROMPT_FORMAT_DEFAULTS_TOP_P=${PROMPT_FORMAT_DEFAULTS_TOP_P} \
LAI_PROMPT_FORMAT_DEFAULTS_TOP_K=${PROMPT_FORMAT_DEFAULTS_TOP_K} \
LAI_TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE} \
LAI_QUANTIZATION=${QUANTIZATION} \
LAI_ENFORCE_EAGER=${ENFORCE_EAGER} \
LAI_GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION} \
LAI_WORKER_USE_RAY=${WORKER_USE_RAY} \
LAI_ENGINE_USE_RAY=${ENGINE_USE_RAY} \
# remove vLLM callback to stats server
VLLM_NO_USAGE_STATS=1
# remove vLLM callback to stats server
ENV VLLM_NO_USAGE_STATS=1

ENV PATH="/home/leapfrogai/.venv/bin:$PATH"

16 changes: 13 additions & 3 deletions packages/vllm/chart/templates/deployment.yaml
@@ -36,16 +36,19 @@ spec:
[
"sh",
"-c",
'while [ ! -f /data/.model/###ZARF_DATA_INJECTION_MARKER### ]; do echo "waiting for zarf data sync" && sleep 1; done; echo "we are done waiting!"',
'while [ ! -f ###ZARF_CONST_MODEL_PATH###/###ZARF_DATA_INJECTION_MARKER### ]; do echo "waiting for zarf data sync" && sleep 1; done; echo "we are done waiting!"',
]
resources:
{{- toYaml .Values.modelInjectionContainer.resources | nindent 12 }}
volumeMounts:
{{- toYaml .Values.modelInjectionContainer.volumeMounts | nindent 12 }}
volumes:
- name: leapfrogai-pv-storage
persistentVolumeClaim:
claimName: lfai-{{ .Values.nameOverride }}-pv-claim
persistentVolumeCLFAIm:
cLFAImName: lfai-{{ .Values.nameOverride }}-pv-cLFAIm
- name: leapfrogai-sdk-configmap
configMap:
name: "{{ .Values.nameOverride }}-sdk-configmap"
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
containers:
@@ -58,6 +61,9 @@ spec:
env:
{{- toYaml . | nindent 12 }}
{{- end }}
envFrom:
- configMapRef:
name: "{{ .Values.nameOverride }}-env-configmap"
ports:
- name: http
containerPort: {{ .Values.service.port }}
@@ -67,6 +73,10 @@
volumeMounts:
- name: leapfrogai-pv-storage
mountPath: "/data"
- name: leapfrogai-sdk-configmap
mountPath: "/home/leapfrogai/config.yaml"
subPath: "config.yaml"
readOnly: true
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
37 changes: 37 additions & 0 deletions packages/vllm/chart/templates/leapfrogai-sdk-configmap.yaml
@@ -0,0 +1,37 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: "{{ .Values.nameOverride }}-sdk-configmap"
namespace: {{ .Release.Namespace | default "leapfrogai" }}
data:
config.yaml: |
model:
source: {{ .Values.leapfrogaiConfig.model.source }}
max_context_length: {{ .Values.leapfrogaiConfig.maxContextLength }}
stop_tokens:
{{- $stopTokens := .Values.leapfrogaiConfig.stopTokens }}
{{- range $stopToken := splitList ", " .Values.leapfrogaiConfig.stopTokens }}
- {{ printf "%s" $stopToken }}
{{- end }}
prompt_format:
{{- with .Values.leapfrogaiConfig.promptFormat.chat }}
chat:
{{- if .system }}
system: {{ .system }}
{{- end }}
{{- if .assistant }}
assistant: {{ .assistant }}
{{- end }}
{{- if .user }}
user: {{ .user }}
{{- end }}
{{- if .function }}
function: {{ .function }}
{{- end }}
{{- end }}
defaults:
temperature: {{ .Values.leapfrogaiConfig.defaults.temperature }}
top_p: {{ .Values.leapfrogaiConfig.defaults.topP }}
top_k: {{ .Values.leapfrogaiConfig.defaults.topK }}
repetition_penalty: {{ .Values.leapfrogaiConfig.defaults.repetitionPenalty }}
max_new_tokens: {{ .Values.leapfrogaiConfig.defaults.maxNewTokens }}
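
The Deployment above mounts the rendered config.yaml at /home/leapfrogai/config.yaml. As a rough illustration only (an assumption, not code from this commit — the actual LeapfrogAI SDK loader may differ), a backend process could read the mounted file like this:

import yaml  # requires PyYAML

# Path matches the volumeMount added to deployment.yaml above.
with open("/home/leapfrogai/config.yaml") as f:
    config = yaml.safe_load(f)

# Top-level keys mirror the template: model, max_context_length,
# stop_tokens, prompt_format, defaults.
print(config["model"]["source"])
print(config["max_context_length"])
print(config["stop_tokens"])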
4 changes: 2 additions & 2 deletions packages/vllm/chart/templates/pvc.yaml
@@ -1,7 +1,7 @@
apiVersion: v1
kind: PersistentVolumeClaim
kind: PersistentVolumeCLFAIm
metadata:
name: lfai-{{ .Values.nameOverride }}-pv-claim
name: lfai-{{ .Values.nameOverride }}-pv-cLFAIm
namespace: {{ .Release.Namespace | default "leapfrogai" }}
spec:
{{- if .Values.persistence.storageClass }}
13 changes: 13 additions & 0 deletions packages/vllm/chart/templates/vllm-engine-configmap.yaml
@@ -0,0 +1,13 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: "{{ .Values.nameOverride }}-engine-configmap"
namespace: {{ .Release.Namespace | default "leapfrogai" }}
data:
LFAI_TRUST_REMOTE_CODE: "{{ .Values.vllmConfig.trustRemoteCode }}"
LFAI_TENSOR_PARALLEL_SIZE: "{{ .Values.vllmConfig.tensorParallelSize }}"
LFAI_ENFORCE_EAGER: "{{ .Values.vllmConfig.enforceEager }}"
LFAI_GPU_MEMORY_UTILIZATION: "{{ .Values.vllmConfig.gpuMemoryUtilization }}"
LFAI_WORKER_USE_RAY: "{{ .Values.vllmConfig.workerUseRay }}"
LFAI_ENGINE_USE_RAY: "{{ .Values.vllmConfig.engineUseRay }}"
LFAI_QUANTIZATION: "{{ .Values.vllmConfig.quantization }}"
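
Every value in this ConfigMap reaches the container as a string via envFrom. A small standalone sketch (not from this commit) of the kind of type coercion that has to happen downstream — in practice the confz/pydantic models in config.py perform this automatically:

import os

# Simulate the environment produced by the engine ConfigMap (all strings).
os.environ.setdefault("LFAI_GPU_MEMORY_UTILIZATION", "0.90")
os.environ.setdefault("LFAI_TENSOR_PARALLEL_SIZE", "1")
os.environ.setdefault("LFAI_ENFORCE_EAGER", "False")

gpu_memory_utilization = float(os.environ["LFAI_GPU_MEMORY_UTILIZATION"])
tensor_parallel_size = int(os.environ["LFAI_TENSOR_PARALLEL_SIZE"])
enforce_eager = os.environ["LFAI_ENFORCE_EAGER"].lower() == "true"

print(gpu_memory_utilization, tensor_parallel_size, enforce_eager)  # 0.9 1 False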
26 changes: 26 additions & 0 deletions packages/vllm/chart/values.yaml
@@ -13,6 +13,32 @@ image:
nameOverride: "vllm"
fullnameOverride: ""

leapfrogaiConfig:
model:
source: "/data/.model/"
maxContextLength: "32768"
stopTokens: "</s>, <|im_end|>, <|endoftext|>"
promptFormat:
chat:
system: "SYSTEM: {}\n"
assistant: "ASSISTANT: {}\n"
user: "USER: {}\n"
defaults:
temperature: "0.1"
topP: "1.0"
topK: "0"
repetitionPenalty: "1.0"
maxNewTokens: "8192"

vllmConfig:
trustRemoteCode: "True"
tensorParallelSize: "1"
enforceEager: "False"
gpuMemoryUtilization: "0.90"
workerUseRay: "True"
engineUseRay: "True"
quantization: "None"

env:
- name: LFAI_LOG_LEVEL
value: "INFO"
5 changes: 2 additions & 3 deletions packages/vllm/src/config.py
@@ -84,9 +84,8 @@ class AppConfig(BaseConfig):
CONFIG_SOURCES = [
EnvSource(
allow_all=True,
prefix="LAI_",
prefix="LFAI_",
remap={
"model_source": "backend_options.model_source",
"tensor_parallel_size": "backend_options.tensor_parallel_size",
"trust_remote_code": "backend_options.trust_remote_code",
"enforce_eager": "backend_options.enforce_eager",
@@ -104,7 +103,7 @@ class DownloadConfig(BaseConfig):
CONFIG_SOURCES = [
EnvSource(
allow_all=True,
prefix="LAI_",
prefix="LFAI_",
remap={
"repo_id": "download_options.repo_id",
"revision": "download_options.revision",
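
A self-contained sketch of the confz pattern used above, with a hypothetical BackendOptions model, showing how the LFAI_ prefix plus remap route flat environment variables into nested config fields; it assumes EnvSource behaves as in the snippet above and is not code from this commit:

import os

from confz import BaseConfig, EnvSource
from pydantic import BaseModel


class BackendOptions(BaseModel):
    tensor_parallel_size: int = 1
    trust_remote_code: bool = False


class AppConfig(BaseConfig):
    backend_options: BackendOptions

    CONFIG_SOURCES = [
        EnvSource(
            allow_all=True,
            prefix="LFAI_",
            remap={
                "tensor_parallel_size": "backend_options.tensor_parallel_size",
                "trust_remote_code": "backend_options.trust_remote_code",
            },
        )
    ]


# LFAI_-prefixed variables (e.g. from the engine ConfigMap) land in nested fields.
os.environ["LFAI_TENSOR_PARALLEL_SIZE"] = "2"
os.environ["LFAI_TRUST_REMOTE_CODE"] = "True"
print(AppConfig().backend_options.tensor_parallel_size)  # 2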
55 changes: 41 additions & 14 deletions packages/vllm/src/main.py
@@ -86,17 +86,18 @@ def remove_iterator(self, async_iterable):

def get_backend_configs():
# Manually load env var as ConfZ does not handle complex types (list)
stop_tokens: str | None = os.getenv("LAI_STOP_TOKENS")
stop_tokens: str | None = os.getenv("LFAI_STOP_TOKENS")
if stop_tokens:
processed_stop_tokens = json.loads(stop_tokens)
else:
processed_stop_tokens = []
del os.environ["LAI_STOP_TOKENS"]
del os.environ["LFAI_STOP_TOKENS"]

env_source = EnvSource(
allow_all=True,
prefix="LAI_",
prefix="LFAI_",
remap={
"model_source": "model.source",
"max_context_length": "max_context_length",
"stop_tokens": "stop_tokens",
"prompt_format_chat_system": "prompt_format.chat.system",
@@ -165,12 +166,13 @@ def __init__(self):
os.environ["VLLM_ALLOW_WORKER_USE_RAY"] = "1"

self.backend_config = get_backend_configs()
self.model = self.backend_config.model.source
self.engine_args = AsyncEngineArgs(
# Taken from the LFAI SDK general LLM configuration
model=self.model,
max_seq_len_to_capture=self.backend_config.max_context_length,
max_model_len=self.backend_config.max_context_length,
# Taken from the vLLM-specific configuration
model=AppConfig().backend_options.model_source,
enforce_eager=AppConfig().backend_options.enforce_eager,
quantization=quantization,
engine_use_ray=AppConfig().backend_options.engine_use_ray,
@@ -241,16 +243,41 @@ async def create_response(
"""Initiate a response generation for the given prompt and configuration, adding the result to the iterator
pool."""

sampling_params = SamplingParams(
temperature=config.temperature,
# Clamp top_p value to prevent float errors
top_p=clamp(config.top_p, 0.0 + sys.float_info.epsilon, 1.0),
# Restrict top_k to valid values, -1 disables top_k
top_k=config.top_k if config.top_k >= 1 else -1,
stop=self.backend_config.stop_tokens,
max_tokens=config.max_new_tokens,
skip_special_tokens=False,
)
request = get_config_from_request(self.request)

# Collect parameters from request, with default fallbacks defined in the LeapfrogAI SDK BackendConfig (config.yaml, ConfigMap)
params = {
"temperature": request.get("temperature", config.temperature),
"top_p": clamp(
request.get("top_p", config.top_p), 0.0 + sys.float_info.epsilon, 1.0
),
"top_k": request.get("top_k", config.top_k if config.top_k >= 1 else -1),
"stop": self.backend_config.stop_tokens,
"max_tokens": request.get("max_tokens", config.max_new_tokens),
"skip_special_tokens": request.get("skip_special_tokens", False),
}

# Optional parameters that come from the request object and should be gracefully omitted if not present
optional_params = [
"n",
"repetition_penalty",
"presence_penalty",
"best_of",
"logit_bias",
"return_full_text",
"truncate",
"typical_p",
"seed",
]

# Add only the optional parameters that exist in the request
for param in optional_params:
if param in request:
params[param] = request[param]

# Pass the collected params to SamplingParams
sampling_params = SamplingParams(**params)

logger.info(f"Begin generation for request {request_id}")
logger.debug(f"{request_id} sampling_params: {sampling_params}")

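
As a usage illustration of the fallback logic added above (a sketch under assumed inputs, not part of the commit), the merge of per-request overrides with ConfigMap-backed defaults can be exercised with plain dictionaries; the helper below only builds keyword arguments and does not call vLLM:

import sys


def clamp(value: float, low: float, high: float) -> float:
    # Same clamping idea applied to top_p above.
    return max(low, min(value, high))


def build_sampling_kwargs(request: dict, defaults: dict) -> dict:
    """Merge per-request overrides with SDK/ConfigMap defaults."""
    params = {
        "temperature": request.get("temperature", defaults["temperature"]),
        "top_p": clamp(
            request.get("top_p", defaults["top_p"]),
            0.0 + sys.float_info.epsilon,
            1.0,
        ),
        "max_tokens": request.get("max_tokens", defaults["max_new_tokens"]),
    }
    # Optional knobs are forwarded only when the caller supplied them.
    for key in ("n", "seed", "repetition_penalty", "presence_penalty"):
        if key in request:
            params[key] = request[key]
    return params


defaults = {"temperature": 0.1, "top_p": 1.0, "max_new_tokens": 8192}
print(build_sampling_kwargs({"temperature": 0.7, "seed": 42}, defaults))
# {'temperature': 0.7, 'top_p': 1.0, 'max_tokens': 8192, 'seed': 42}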
33 changes: 33 additions & 0 deletions packages/vllm/values/upstream-values.yaml
@@ -2,6 +2,39 @@ image:
repository: "ghcr.io/defenseunicorns/leapfrogai/vllm"
tag: "###ZARF_CONST_IMAGE_VERSION###"

nameOverride: "###ZARF_VAR_NAME_OVERRIDE###"

leapfrogaiConfig:
model:
source: "###ZARF_CONST_MODEL_PATH###"
maxContextLength: "32768"
stopTokens: "###ZARF_VAR_STOP_TOKENS###"
promptFormat:
chat:
system: "###ZARF_VAR_PROMPT_FORMAT_CHAT_SYSTEM###"
assistant: "###ZARF_VAR_PROMPT_FORMAT_CHAT_ASSISTANT###"
user: "###ZARF_VAR_PROMPT_FORMAT_CHAT_USER###"
defaults:
temperature: "###ZARF_VAR_TEMPERATURE###"
topP: "###ZARF_VAR_TOP_P###"
topK: "###ZARF_VAR_TOP_K###"
repetitionPenalty: "###ZARF_VAR_REPETITION_PENALTY###"
maxNewTokens: "###ZARF_VAR_MAX_NEW_TOKENS###"


vllmConfig:
trustRemoteCode: "###ZARF_VAR_TRUST_REMOTE_CODE###"
tensorParallelSize: "###ZARF_VAR_TENSOR_PARALLEL_SIZE###"
enforceEager: "###ZARF_VAR_ENFORCE_EAGER###"
gpuMemoryUtilization: "###ZARF_VAR_GPU_MEMORY_UTILIZATION###"
workerUseRay: "###ZARF_VAR_WORKER_USE_RAY###"
engineUseRay: "###ZARF_VAR_ENGINE_USE_RAY###"
quantization: "###ZARF_VAR_QUANTIZATION###"

env:
- name: LFAI_LOG_LEVEL
value: "INFO"

gpu:
runtimeClassName: "###ZARF_VAR_GPU_RUNTIME###"
