diff --git a/packages/vllm/.env.example b/packages/vllm/.env.example index c649cd751..4b49e6f64 100644 --- a/packages/vllm/.env.example +++ b/packages/vllm/.env.example @@ -1,18 +1,18 @@ -LAI_HF_HUB_ENABLE_HF_TRANSFER="1" -LAI_REPO_ID="justinthelaw/Hermes-2-Pro-Mistral-7B-4bit-32g" -LAI_REVISION="main" -LAI_TENSOR_PARALLEL_SIZE=1 -LAI_TRUST_REMOTE_CODE=True -LAI_MODEL_SOURCE=".model/" -LAI_MAX_CONTEXT_LENGTH=32768 -LAI_STOP_TOKENS='["</s>"]' -LAI_PROMPT_FORMAT_CHAT_SYSTEM="<|im_start|>system\n{}<|im_end|>\n" -LAI_PROMPT_FORMAT_CHAT_USER="<|im_start|>user\n{}<|im_end|>\n" -LAI_PROMPT_FORMAT_CHAT_ASSISTANT="<|im_start|>assistant\n{}<|im_end|>\n" -LAI_PROMPT_FORMAT_DEFAULTS_TOP_P=1.0 -LAI_PROMPT_FORMAT_DEFAULTS_TOP_K=0 -LAI_ENFORCE_EAGER=False -LAI_GPU_MEMORY_UTILIZATION=0.90 -LAI_WORKER_USE_RAY=True -LAI_ENGINE_USE_RAY=True -LAI_QUANTIZATION="None" +LFAI_HF_HUB_ENABLE_HF_TRANSFER="1" +LFAI_REPO_ID="defenseunicorns/Hermes-2-Pro-Mistral-7B-4bit-32g" +LFAI_REVISION="main" +LFAI_TENSOR_PARALLEL_SIZE=1 +LFAI_TRUST_REMOTE_CODE=True +LFAI_MODEL_SOURCE=".model/" +LFAI_MAX_CONTEXT_LENGTH=32768 +LFAI_STOP_TOKENS='["</s>"]' +LFAI_PROMPT_FORMAT_CHAT_SYSTEM="<|im_start|>system\n{}<|im_end|>\n" +LFAI_PROMPT_FORMAT_CHAT_USER="<|im_start|>user\n{}<|im_end|>\n" +LFAI_PROMPT_FORMAT_CHAT_ASSISTANT="<|im_start|>assistant\n{}<|im_end|>\n" +LFAI_PROMPT_FORMAT_DEFAULTS_TOP_P=1.0 +LFAI_PROMPT_FORMAT_DEFAULTS_TOP_K=0 +LFAI_ENFORCE_EAGER=False +LFAI_GPU_MEMORY_UTILIZATION=0.90 +LFAI_WORKER_USE_RAY=True +LFAI_ENGINE_USE_RAY=True +LFAI_QUANTIZATION="None" diff --git a/packages/vllm/Dockerfile b/packages/vllm/Dockerfile index 2516487ac..cdd96eb5a 100755 --- a/packages/vllm/Dockerfile +++ b/packages/vllm/Dockerfile @@ -77,23 +77,6 @@ FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04 # set SDK location ARG SDK_DEST=src/leapfrogai_sdk/build -# model-specific arguments -ARG TRUST_REMOTE_CODE="True" \ - MODEL_SOURCE="/data/.model/" \ - MAX_CONTEXT_LENGTH=32768 \ - STOP_TOKENS='["</s>"]' \ - PROMPT_FORMAT_CHAT_SYSTEM="<|im_start|>system\n{}<|im_end|>\n" \ - PROMPT_FORMAT_CHAT_USER="<|im_start|>user\n{}<|im_end|>\n" \ - PROMPT_FORMAT_CHAT_ASSISTANT="<|im_start|>assistant\n{}<|im_end|>\n" \ - PROMPT_FORMAT_DEFAULTS_TOP_P=1.0 \ - PROMPT_FORMAT_DEFAULTS_TOP_K=0 \ - TENSOR_PARALLEL_SIZE=1 \ - ENFORCE_EAGER=False \ - QUANTIZATION="None" \ - GPU_MEMORY_UTILIZATION=0.90 \ - WORKER_USE_RAY=True \ - ENGINE_USE_RAY=True - # setup nonroot user and permissions USER root RUN groupadd -g 65532 vglusers && \ @@ -110,24 +93,8 @@ COPY --from=builder --chown=nonroot:nonroot /home/leapfrogai/packages/vllm/src / # copy-in python binaries COPY --from=builder --chown=nonroot:nonroot /home/nonroot/.pyenv/versions/3.11.6/ /home/nonroot/.pyenv/versions/3.11.6/ -# load ARG values into env variables for pickup by confz -ENV LAI_TRUST_REMOTE_CODE=${TRUST_REMOTE_CODE} \ - LAI_MODEL_SOURCE=${MODEL_SOURCE} \ - LAI_MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH} \ - LAI_STOP_TOKENS=${STOP_TOKENS} \ - LAI_PROMPT_FORMAT_CHAT_SYSTEM=${PROMPT_FORMAT_CHAT_SYSTEM} \ - LAI_PROMPT_FORMAT_CHAT_USER=${PROMPT_FORMAT_CHAT_USER} \ - LAI_PROMPT_FORMAT_CHAT_ASSISTANT=${PROMPT_FORMAT_CHAT_ASSISTANT} \ - LAI_PROMPT_FORMAT_DEFAULTS_TOP_P=${PROMPT_FORMAT_DEFAULTS_TOP_P} \ - LAI_PROMPT_FORMAT_DEFAULTS_TOP_K=${PROMPT_FORMAT_DEFAULTS_TOP_K} \ - LAI_TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE} \ - LAI_QUANTIZATION=${QUANTIZATION} \ - LAI_ENFORCE_EAGER=${ENFORCE_EAGER} \ - LAI_GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION} \ - LAI_WORKER_USE_RAY=${WORKER_USE_RAY} \ - LAI_ENGINE_USE_RAY=${ENGINE_USE_RAY} 
\ - # remove vLLM callback to stats server - VLLM_NO_USAGE_STATS=1 +# remove vLLM callback to stats server +ENV VLLM_NO_USAGE_STATS=1 ENV PATH="/home/leapfrogai/.venv/bin:$PATH" diff --git a/packages/vllm/chart/templates/deployment.yaml b/packages/vllm/chart/templates/deployment.yaml index 7b88cc137..98b214cc6 100644 --- a/packages/vllm/chart/templates/deployment.yaml +++ b/packages/vllm/chart/templates/deployment.yaml @@ -36,7 +36,7 @@ spec: [ "sh", "-c", - 'while [ ! -f /data/.model/###ZARF_DATA_INJECTION_MARKER### ]; do echo "waiting for zarf data sync" && sleep 1; done; echo "we are done waiting!"', + 'while [ ! -f ###ZARF_CONST_MODEL_PATH###/###ZARF_DATA_INJECTION_MARKER### ]; do echo "waiting for zarf data sync" && sleep 1; done; echo "we are done waiting!"', ] resources: {{- toYaml .Values.modelInjectionContainer.resources | nindent 12 }} @@ -44,8 +44,11 @@ spec: {{- toYaml .Values.modelInjectionContainer.volumeMounts | nindent 12 }} volumes: - name: leapfrogai-pv-storage - persistentVolumeClaim: - claimName: lfai-{{ .Values.nameOverride }}-pv-claim + persistentVolumeClaim: + claimName: lfai-{{ .Values.nameOverride }}-pv-claim + - name: leapfrogai-sdk-configmap + configMap: + name: "{{ .Values.nameOverride }}-sdk-configmap" securityContext: {{- toYaml .Values.podSecurityContext | nindent 8 }} containers: @@ -58,6 +61,9 @@ spec: env: {{- toYaml . | nindent 12 }} {{- end }} + envFrom: + - configMapRef: + name: "{{ .Values.nameOverride }}-engine-configmap" ports: - name: http containerPort: {{ .Values.service.port }} @@ -67,6 +73,10 @@ spec: volumeMounts: - name: leapfrogai-pv-storage mountPath: "/data" + - name: leapfrogai-sdk-configmap + mountPath: "/home/leapfrogai/config.yaml" + subPath: "config.yaml" + readOnly: true {{- with .Values.nodeSelector }} nodeSelector: {{- toYaml . 
| nindent 8 }} diff --git a/packages/vllm/chart/templates/leapfrogai-sdk-configmap.yaml b/packages/vllm/chart/templates/leapfrogai-sdk-configmap.yaml new file mode 100644 index 000000000..24c4461de --- /dev/null +++ b/packages/vllm/chart/templates/leapfrogai-sdk-configmap.yaml @@ -0,0 +1,37 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Values.nameOverride }}-sdk-configmap" + namespace: {{ .Release.Namespace | default "leapfrogai" }} +data: + config.yaml: | + model: + source: {{ .Values.leapfrogaiConfig.model.source }} + max_context_length: {{ .Values.leapfrogaiConfig.maxContextLength }} + stop_tokens: + {{- $stopTokens := .Values.leapfrogaiConfig.stopTokens }} + {{- range $stopToken := splitList ", " .Values.leapfrogaiConfig.stopTokens }} + - {{ printf "%s" $stopToken }} + {{- end }} + prompt_format: + {{- with .Values.leapfrogaiConfig.promptFormat.chat }} + chat: + {{- if .system }} + system: {{ .system }} + {{- end }} + {{- if .assistant }} + assistant: {{ .assistant }} + {{- end }} + {{- if .user }} + user: {{ .user }} + {{- end }} + {{- if .function }} + function: {{ .function }} + {{- end }} + {{- end }} + defaults: + temperature: {{ .Values.leapfrogaiConfig.defaults.temperature }} + top_p: {{ .Values.leapfrogaiConfig.defaults.topP }} + top_k: {{ .Values.leapfrogaiConfig.defaults.topK }} + repetition_penalty: {{ .Values.leapfrogaiConfig.defaults.repetitionPenalty }} + max_new_tokens: {{ .Values.leapfrogaiConfig.defaults.maxNewTokens }} diff --git a/packages/vllm/chart/templates/pvc.yaml b/packages/vllm/chart/templates/pvc.yaml index 3902bc25e..9ecf891fc 100644 --- a/packages/vllm/chart/templates/pvc.yaml +++ b/packages/vllm/chart/templates/pvc.yaml @@ -1,7 +1,7 @@ apiVersion: v1 -kind: PersistentVolumeClaim +kind: PersistentVolumeClaim metadata: - name: lfai-{{ .Values.nameOverride }}-pv-claim + name: lfai-{{ .Values.nameOverride }}-pv-claim namespace: {{ .Release.Namespace | default "leapfrogai" }} spec: {{- if .Values.persistence.storageClass }} diff --git a/packages/vllm/chart/templates/vllm-engine-configmap.yaml b/packages/vllm/chart/templates/vllm-engine-configmap.yaml new file mode 100644 index 000000000..6c7931d29 --- /dev/null +++ b/packages/vllm/chart/templates/vllm-engine-configmap.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Values.nameOverride }}-engine-configmap" + namespace: {{ .Release.Namespace | default "leapfrogai" }} +data: + LFAI_TRUST_REMOTE_CODE: "{{ .Values.vllmConfig.trustRemoteCode }}" + LFAI_TENSOR_PARALLEL_SIZE: "{{ .Values.vllmConfig.tensorParallelSize }}" + LFAI_ENFORCE_EAGER: "{{ .Values.vllmConfig.enforceEager }}" + LFAI_GPU_MEMORY_UTILIZATION: "{{ .Values.vllmConfig.gpuMemoryUtilization }}" + LFAI_WORKER_USE_RAY: "{{ .Values.vllmConfig.workerUseRay }}" + LFAI_ENGINE_USE_RAY: "{{ .Values.vllmConfig.engineUseRay }}" + LFAI_QUANTIZATION: "{{ .Values.vllmConfig.quantization }}" diff --git a/packages/vllm/chart/values.yaml b/packages/vllm/chart/values.yaml index dbe88e500..d4968b726 100644 --- a/packages/vllm/chart/values.yaml +++ b/packages/vllm/chart/values.yaml @@ -13,6 +13,32 @@ image: nameOverride: "vllm" fullnameOverride: "" +leapfrogaiConfig: + model: + source: "/data/.model/" + maxContextLength: "32768" + stopTokens: "</s>, <|im_end|>, <|endoftext|>" + promptFormat: + chat: + system: "SYSTEM: {}\n" + assistant: "ASSISTANT: {}\n" + user: "USER: {}\n" + defaults: + temperature: "0.1" + topP: "1.0" + topK: "0" + repetitionPenalty: "1.0" + maxNewTokens: "8192" + +vllmConfig: + trustRemoteCode: "True" + 
tensorParallelSize: "1" + enforceEager: "False" + gpuMemoryUtilization: "0.90" + workerUseRay: "True" + engineUseRay: "True" + quantization: "None" + env: - name: LFAI_LOG_LEVEL value: "INFO" diff --git a/packages/vllm/src/config.py b/packages/vllm/src/config.py index 00d3e11b7..3866e72b1 100644 --- a/packages/vllm/src/config.py +++ b/packages/vllm/src/config.py @@ -84,9 +84,8 @@ class AppConfig(BaseConfig): CONFIG_SOURCES = [ EnvSource( allow_all=True, - prefix="LAI_", + prefix="LFAI_", remap={ - "model_source": "backend_options.model_source", "tensor_parallel_size": "backend_options.tensor_parallel_size", "trust_remote_code": "backend_options.trust_remote_code", "enforce_eager": "backend_options.enforce_eager", @@ -104,7 +103,7 @@ class DownloadConfig(BaseConfig): CONFIG_SOURCES = [ EnvSource( allow_all=True, - prefix="LAI_", + prefix="LFAI_", remap={ "repo_id": "download_options.repo_id", "revision": "download_options.revision", diff --git a/packages/vllm/src/main.py b/packages/vllm/src/main.py index e935a1f45..a42c2d431 100644 --- a/packages/vllm/src/main.py +++ b/packages/vllm/src/main.py @@ -86,17 +86,18 @@ def remove_iterator(self, async_iterable): def get_backend_configs(): # Manually load env var as ConfZ does not handle complex types (list) - stop_tokens: str | None = os.getenv("LAI_STOP_TOKENS") + stop_tokens: str | None = os.getenv("LFAI_STOP_TOKENS") if stop_tokens: processed_stop_tokens = json.loads(stop_tokens) else: processed_stop_tokens = [] - del os.environ["LAI_STOP_TOKENS"] + del os.environ["LFAI_STOP_TOKENS"] env_source = EnvSource( allow_all=True, - prefix="LAI_", + prefix="LFAI_", remap={ + "model_source": "model.source", "max_context_length": "max_context_length", "stop_tokens": "stop_tokens", "prompt_format_chat_system": "prompt_format.chat.system", @@ -165,12 +166,13 @@ def __init__(self): os.environ["VLLM_ALLOW_WORKER_USE_RAY"] = "1" self.backend_config = get_backend_configs() + self.model = self.backend_config.model.source self.engine_args = AsyncEngineArgs( # Taken from the LFAI SDK general LLM configuration + model=self.model, max_seq_len_to_capture=self.backend_config.max_context_length, max_model_len=self.backend_config.max_context_length, # Taken from the vLLM-specific configuration - model=AppConfig().backend_options.model_source, enforce_eager=AppConfig().backend_options.enforce_eager, quantization=quantization, engine_use_ray=AppConfig().backend_options.engine_use_ray, @@ -241,16 +243,41 @@ async def create_response( """Initiate a response generation for the given prompt and configuration, adding the result to the iterator pool.""" - sampling_params = SamplingParams( - temperature=config.temperature, - # Clamp top_p value to prevent float errors - top_p=clamp(config.top_p, 0.0 + sys.float_info.epsilon, 1.0), - # Restrict top_k to valid values, -1 disables top_k - top_k=config.top_k if config.top_k >= 1 else -1, - stop=self.backend_config.stop_tokens, - max_tokens=config.max_new_tokens, - skip_special_tokens=False, - ) + request = get_config_from_request(self.request) + + # Collect parameters from request, with default fallbacks defined in the LeapfrogAI SDK BackendConfig (config.yaml, ConfigMap) + params = { + "temperature": request.get("temperature", config.temperature), + "top_p": clamp( + request.get("top_p", config.top_p), 0.0 + sys.float_info.epsilon, 1.0 + ), + "top_k": request.get("top_k", config.top_k if config.top_k >= 1 else -1), + "stop": self.backend_config.stop_tokens, + "max_tokens": request.get("max_tokens", config.max_new_tokens), + 
"skip_special_tokens": request.get("skip_special_tokens", False), + } + + # Optional parameters that come from the request object and should be gracefully omitted if not present + optional_params = [ + "n", + "repetition_penalty", + "presence_penalty", + "best_of", + "logit_bias", + "return_full_text", + "truncate", + "typical_p", + "seed", + ] + + # Add only the optional parameters that exist in the request + for param in optional_params: + if param in request: + params[param] = request[param] + + # Pass the collected params to SamplingParams + sampling_params = SamplingParams(**params) + logger.info(f"Begin generation for request {request_id}") logger.debug(f"{request_id} sampling_paramms: {sampling_params}") diff --git a/packages/vllm/values/upstream-values.yaml b/packages/vllm/values/upstream-values.yaml index 0fe581bdd..56019a38c 100644 --- a/packages/vllm/values/upstream-values.yaml +++ b/packages/vllm/values/upstream-values.yaml @@ -2,6 +2,39 @@ image: repository: "ghcr.io/defenseunicorns/leapfrogai/vllm" tag: "###ZARF_CONST_IMAGE_VERSION###" +nameOverride: "###ZARF_VAR_NAME_OVERRIDE###" + +leapfrogaiConfig: + model: + source: "###ZARF_CONST_MODEL_PATH###" + maxContextLength: "32768" + stopTokens: "###ZARF_VAR_STOP_TOKENS###" + promptFormat: + chat: + system: "###ZARF_VAR_PROMPT_FORMAT_CHAT_SYSTEM###" + assistant: "###ZARF_VAR_PROMPT_FORMAT_CHAT_ASSISTANT###" + user: "###ZARF_VAR_PROMPT_FORMAT_CHAT_USER###" + defaults: + temperature: "###ZARF_VAR_TEMPERATURE###" + topP: "###ZARF_VAR_TOP_P###" + topK: "###ZARF_VAR_TOP_K###" + repetitionPenalty: "###ZARF_VAR_REPETITION_PENALTY###" + maxNewTokens: "###ZARF_VAR_MAX_NEW_TOKENS###" + + +vllmConfig: + trustRemoteCode: "###ZARF_VAR_TRUST_REMOTE_CODE###" + tensorParallelSize: "###ZARF_VAR_TENSOR_PARALLEL_SIZE###" + enforceEager: "###ZARF_VAR_ENFORCE_EAGER###" + gpuMemoryUtilization: "###ZARF_VAR_GPU_MEMORY_UTILIZATION###" + workerUseRay: "###ZARF_VAR_WORKER_USE_RAY###" + engineUseRay: "###ZARF_VAR_ENGINE_USE_RAY###" + quantization: "###ZARF_VAR_QUANTIZATION###" + +env: + - name: LFAI_LOG_LEVEL + value: "INFO" + gpu: runtimeClassName: "###ZARF_VAR_GPU_RUNTIME###" diff --git a/packages/vllm/zarf.yaml b/packages/vllm/zarf.yaml index 40df92e43..27280f766 100644 --- a/packages/vllm/zarf.yaml +++ b/packages/vllm/zarf.yaml @@ -9,26 +9,93 @@ metadata: constants: - name: IMAGE_VERSION value: "###ZARF_PKG_TMPL_IMAGE_VERSION###" + - name: MODEL_REPO_ID + description: "The HuggingFace repository ID" + value: "TheBloke/Synthia-7B-v2.0-GPTQ" + - name: MODEL_REVISION + value: "gptq-4bit-32g-actorder_True" + - name: MODEL_PATH + description: "Location of the Zarf Injected model files" + value: "/data/.model/" variables: + # vLLM runtime configuration + - name: TRUST_REMOTE_CODE + description: "If True, allows the execution of code within the model files directory" + default: "True" + pattern: "^(True|False)$" + - name: TENSOR_PARALLEL_SIZE + description: "The number of tensor parallelism splits, typically used for model parallelism across GPUs" + default: "1" + pattern: "^[1-9][0-9]*$" + - name: ENFORCE_EAGER + description: "If set to True, enforces eager execution mode instead of lazy execution, impacting performance" + default: "False" + pattern: "^(True|False)$" + - name: GPU_MEMORY_UTILIZATION + description: "The fraction of GPU memory to be utilized, expressed as a decimal value between 0.01 and 0.99" + default: "0.90" + pattern: ^0\.(0[1-9]|[1-9][0-9])$ + - name: WORKER_USE_RAY + description: "If True, uses Ray for distributed worker management" + 
default: "True" + pattern: "^(True|False)$" + - name: ENGINE_USE_RAY + description: "If True, uses Ray for managing the execution engine" + default: "True" + pattern: "^(True|False)$" + - name: QUANTIZATION + description: "If None, allows vLLM to automatically detect via model files and configuration" + default: "None" + # LeapfrogAI SDK runtime configuration + - name: MAX_CONTEXT_LENGTH + description: "The maximum number of tokens the model can process in a single input before the inferencing engine's overflow strategy is used" + default: "32768" + pattern: "^[1-9][0-9]*$" + - name: STOP_TOKENS + description: "A set of special tokens that signal the model to stop producing further output, delimited using a comma and space" + default: ", <|im_end|>, <|endoftext|>" + pattern: ^(<[^,]+>\s*,\s*)*<[^,]+>\s*$ + - name: PROMPT_FORMAT_CHAT_SYSTEM + default: "SYSTEM: {}\n" + - name: PROMPT_FORMAT_CHAT_USER + default: "USER: {}\n" + - name: PROMPT_FORMAT_CHAT_ASSISTANT + default: "ASSISTANT: {}\n" + - name: PROMPT_FORMAT_DEFAULTS_TOP_P + description: "The cumulative probability threshold for token sampling, where 1.0 represents no restriction" + default: "1.0" + pattern: ^(0(\.\d+)?|1(\.0+)?)$ + - name: PROMPT_FORMAT_DEFAULTS_TOP_K + description: "The number of top-K tokens to consider during sampling, where 0 disables top-K sampling" + default: "0" + pattern: ^\d+$ + - name: TEMPERATURE + description: "Controls the randomness of the model's output" + default: "0.1" + pattern: ^(0(\.\d+)?|1(\.0+)?)$ + # Pod deployment configuration - name: GPU_LIMIT - description: The GPU limit for the model inferencing. Must be 1 or more. + description: "The GPU limit for the model inferencing. Must be 1 or more." default: "1" pattern: "^[1-9][0-9]*$" - name: GPU_RUNTIME - description: The GPU runtime name for the model inferencing. + description: "The GPU runtime name for the model inferencing." default: "nvidia" pattern: "^(nvidia)?$" + - name: NAME_OVERRIDE + description: "Provide an override for the name of the deployment (e.g., the model name)" + default: "vllm" - name: PVC_SIZE - description: Size of the PVC used for model storage. + description: "Size of the PVC used for model storage." default: "15Gi" pattern: "^[0-9]+[a-zA-Z]+$" - name: PVC_ACCESS_MODE - description: Access mode of the PVC used for model storage. + description: "Access mode of the PVC used for model storage." default: "ReadWriteOnce" pattern: "^(ReadWriteOnce|ReadOnlyMany|ReadWriteMany)$" - name: PVC_STORAGE_CLASS - description: Storage class of the PVC used for model storage. + description: "Storage class of the PVC used for model storage." 
default: "local-path" components: @@ -37,31 +104,33 @@ components: only: flavor: upstream charts: - - name: vllm-model + - name: "###ZARF_VAR_NAME_OVERRIDE###-model" namespace: leapfrogai localPath: chart - releaseName: vllm-model + releaseName: "###ZARF_VAR_NAME_OVERRIDE###-model" # x-release-please-start-version version: 0.12.2 # x-release-please-end valuesFiles: - "values/upstream-values.yaml" images: - - ghcr.io/defenseunicorns/leapfrogai/vllm:###ZARF_PKG_TMPL_IMAGE_VERSION### - - cgr.dev/chainguard/bash:latest + - "ghcr.io/defenseunicorns/leapfrogai/vllm:###ZARF_PKG_TMPL_IMAGE_VERSION###" + - "cgr.dev/chainguard/bash:latest" dataInjections: - - source: .model/ + # location where locally downloaded model files are located + - source: ".model/" target: - namespace: leapfrogai - selector: app=lfai-vllm - container: data-loader - path: /data/.model + namespace: "leapfrogai" + selector: "app=lfai-###ZARF_VAR_NAME_OVERRIDE###" + container: "data-loader" + # location in the container for injection of the model files + path: "###ZARF_CONST_MODEL_PATH###" compress: true actions: onCreate: before: # NOTE: This assumes python is installed and in $PATH and 'huggingface_hub[cli,hf_transfer]' has been installed - - cmd: python src/model_download.py + - cmd: "python src/model_download.py" env: - - LAI_REPO_ID=defenseunicorns/Hermes-2-Pro-Mistral-7B-4bit-32g - - LAI_REVISION=main + - LFAI_REPO_ID=###ZARF_CONST_MODEL_REPO_ID### + - LFAI_REVISION=###ZARF_CONST_MODEL_REVISION### diff --git a/src/leapfrogai_sdk/llm.py b/src/leapfrogai_sdk/llm.py index 6ce6bc4d2..439c4da1b 100644 --- a/src/leapfrogai_sdk/llm.py +++ b/src/leapfrogai_sdk/llm.py @@ -101,6 +101,7 @@ def create_completion_response( class NewClass(_cls): config: BackendConfig + request_config: GenerationConfig def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -127,6 +128,7 @@ def _build_gen_stream( watermark=request.watermark, seed=request.seed, ) + self.request_config = config return self.generate(prompt, config) async def ChatComplete(