From 8089b647ccf123a68c1dac7e2c4c5f4643f6857a Mon Sep 17 00:00:00 2001
From: Jorge Antonio
Date: Fri, 27 Dec 2024 11:05:46 +0000
Subject: [PATCH 1/2] first commit

---
 docker-compose.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docker-compose.yaml b/docker-compose.yaml
index 0e768aee..56d26c76 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -126,6 +126,8 @@ services:
     container_name: chat-completions
     profiles: [chat_completions_vllm]
    image: vllm/vllm-openai:v0.6.5
+    environment:
+      - VLLM_ATTENTION_BACKEND=FLASH_ATTN # Use Flash Attention for better performance, possible values are: [TORCH_SDPA, FLASH_ATTN, FLASHINFER, ROCM_FLASH, XFORMERS] use FLASHINFER for fp8 quantized models
     ipc: host
     command: ${VLLM_ENGINE_ARGS}

From ce06525245fe4cc4f4b969c1fe5c1abaf35ee9cc Mon Sep 17 00:00:00 2001
From: Jorge Antonio
Date: Fri, 27 Dec 2024 11:07:13 +0000
Subject: [PATCH 2/2] add better docs

---
 docker-compose.yaml | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/docker-compose.yaml b/docker-compose.yaml
index 56d26c76..ed5c5728 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -127,7 +127,14 @@ services:
     profiles: [chat_completions_vllm]
     image: vllm/vllm-openai:v0.6.5
     environment:
-      - VLLM_ATTENTION_BACKEND=FLASH_ATTN # Use Flash Attention for better performance, possible values are: [TORCH_SDPA, FLASH_ATTN, FLASHINFER, ROCM_FLASH, XFORMERS] use FLASHINFER for fp8 quantized models
+      # Backend for attention computation
+      # Available options:
+      # - "TORCH_SDPA": use torch.nn.MultiheadAttention
+      # - "FLASH_ATTN": use FlashAttention
+      # - "XFORMERS": use XFormers
+      # - "ROCM_FLASH": use ROCmFlashAttention
+      # - "FLASHINFER": use flashinfer (recommended for fp8 quantized models)
+      - VLLM_ATTENTION_BACKEND=FLASH_ATTN
     ipc: host
     command: ${VLLM_ENGINE_ARGS}
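
A minimal sketch of how the FLASHINFER backend could be enabled for an fp8 quantized model without editing the patched compose file, assuming Compose's standard docker-compose.override.yaml merging and a service key of chat-completions (the hunks above only show container_name, so the actual service name is an assumption):

# docker-compose.override.yaml -- hypothetical override, not part of the patch above
services:
  chat-completions:          # assumed service name; only container_name appears in the diff
    environment:
      # Per the docs added in PATCH 2/2, FLASHINFER is recommended for fp8 quantized models
      - VLLM_ATTENTION_BACKEND=FLASHINFER

Compose merges environment entries by key, so this override replaces only VLLM_ATTENTION_BACKEND and leaves the rest of the service definition from docker-compose.yaml intact.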