# packages/vllm/zarf-config.yaml

# Every value in this file is a quoted string, including the boolean- and
# number-looking ones. Keep them quoted so a YAML parser does not retype them.
package:
  create:
    set:
      # The two marker comments around image_version bracket the version line;
      # presumably consumed by release tooling, so leave them unchanged.
      # x-release-please-start-version
      image_version: '0.14.0'
      # x-release-please-end

      # Model selection, storage path, and name override
      model_repo_id: 'TheBloke/Synthia-7B-v2.0-GPTQ'
      model_revision: 'gptq-4bit-32g-actorder_True'
      model_path: '/data/.model/'
      name_override: 'vllm'
  deploy:
    set:
      # vLLM runtime settings (in local development these usually come from .env)
      trust_remote_code: 'True'
      tensor_parallel_size: '1'
      enforce_eager: 'False'
      gpu_memory_utilization: '0.90'
      worker_use_ray: 'True'
      engine_use_ray: 'True'
      # NOTE(review): the literal string 'None', not a YAML null — confirm how
      # the consumer interprets it.
      quantization: 'None'
      load_format: 'auto'
      # LeapfrogAI SDK runtime settings (in development these usually come from config.yaml)
      max_context_length: '32768'
      # A single string holding the stop tokens separated by ", "
      stop_tokens: '</s>, <|im_end|>, <|endoftext|>'
      # Double-quoted on purpose: the trailing \n must be read as a newline escape
      prompt_format_chat_system: "SYSTEM: {}\n"
      prompt_format_chat_user: "USER: {}\n"
      prompt_format_chat_assistant: "ASSISTANT: {}\n"
      temperature: '0.1'
      top_p: '1.0'
      top_k: '0'
      repetition_penalty: '1.0'
      max_new_tokens: '8192'
      # Pod-level deployment settings: GPU and persistent volume claim
      gpu_limit: '1'
      gpu_runtime: 'nvidia'
      pvc_size: '15Gi'
      pvc_access_mode: 'ReadWriteOnce'
      pvc_storage_class: 'local-path'