[misc] use out argument for flash attention (#10822)
Signed-off-by: youkaichao <[email protected]>
youkaichao authored Dec 2, 2024
1 parent e95f275 commit a4c4daf
Showing 13 changed files with 144 additions and 157 deletions.
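
In short: rather than each attention backend allocating and returning its own result tensor, the Attention layer now pre-allocates `output` and the FlashAttention kernels write into it through their `out=` argument. A minimal sketch of this caller-allocated-output pattern in plain PyTorch (illustrative shapes only; `torch.matmul` stands in for the real kernels, which receive the buffer via the `out=` argument of `flash_attn_varlen_func` and `flash_attn_with_kvcache`):

import torch

num_tokens, num_heads, head_size = 8, 32, 128

# The caller (the Attention layer) owns the output buffer.
query = torch.randn(num_tokens, num_heads * head_size)
output = torch.empty_like(query)

# Backends now receive [num_tokens, num_heads, head_size] views.
q = query.view(-1, num_heads, head_size)
o = output.view(-1, num_heads, head_size)

# The kernel writes its result into the caller-owned buffer via `out=`
# instead of returning a newly allocated tensor.
torch.matmul(q, torch.eye(head_size), out=o)

# `output` is filled in place; no extra allocation or final reshape is needed.
assert o.data_ptr() == output.data_ptr()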
1 change: 1 addition & 0 deletions vllm/attention/backends/abstract.py
@@ -247,5 +247,6 @@ def forward(
k_scale: float = 1.0,
v_scale: float = 1.0,
attn_type: str = AttentionType.DECODER,
output: Optional[torch.Tensor] = None,
) -> torch.Tensor:
raise NotImplementedError
2 changes: 2 additions & 0 deletions vllm/attention/backends/blocksparse_attn.py
@@ -360,6 +360,7 @@ def forward(
k_scale: float = 1.0,
v_scale: float = 1.0,
attn_type: str = AttentionType.DECODER,
output: Optional[torch.Tensor] = None,
) -> torch.Tensor:
"""Forward pass with FlashAttention and PagedAttention.
@@ -448,5 +449,6 @@ def forward(
blocksparse_head_sliding_step=self.head_sliding_step,
)

assert output is not None
# Reshape the output tensor.
return output.view(num_tokens, hidden_size)
55 changes: 19 additions & 36 deletions vllm/attention/backends/flash_attn.py
@@ -638,24 +638,27 @@ def forward(
k_scale: float = 1.0,
v_scale: float = 1.0,
attn_type: str = AttentionType.DECODER,
output: Optional[torch.Tensor] = None,
) -> torch.Tensor:
"""Forward pass with FlashAttention.
Args:
query: shape = [num_tokens, num_heads * head_size]
key: shape = [num_tokens, num_kv_heads * head_size]
value: shape = [num_tokens, num_kv_heads * head_size]
query: shape = [num_tokens, num_heads, head_size]
key: shape = [num_tokens, num_kv_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size]
output: shape = [num_tokens, num_heads, head_size]
kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
NOTE: kv_cache will be an empty tensor with shape [0]
for profiling run.
attn_metadata: Metadata for attention.
Returns:
shape = [num_tokens, num_heads * head_size]
NOTE: It in-place updates the output tensor.
"""
# NOTE(woosuk): FlashAttention does not support FP8 KV cache.
assert k_scale == 1.0 and v_scale == 1.0, (
"key/v_scale is not supported in FlashAttention.")

assert output is not None, "Output tensor must be provided."

if (attn_type == AttentionType.ENCODER
and (not attn_metadata.is_all_encoder_attn_metadata_set)):
raise AttributeError("Encoder attention requires setting "
@@ -666,23 +669,12 @@ def forward(
"requires setting cross-attention "
"metadata attributes.")

num_heads: int = self.num_heads
head_size: int = self.head_size
num_kv_heads: int = self.num_kv_heads
kv_cache_dtype: str = self.kv_cache_dtype
softmax_scale: float = self.scale
window_size = self.sliding_window
alibi_slopes: Optional[torch.Tensor] = self.alibi_slopes
logits_soft_cap: Optional[float] = self.logits_soft_cap

num_tokens, hidden_size = query.shape

# Reshape the query, key, and value tensors.
query = query.view(-1, num_heads, head_size)
if (key is not None) and (value is not None):
key = key.view(-1, num_kv_heads, head_size)
value = value.view(-1, num_kv_heads, head_size)

if kv_cache.numel() > 0:
key_cache = kv_cache[0]
value_cache = kv_cache[1]
@@ -721,13 +713,13 @@ def forward(
num_decode_query_tokens) = \
get_num_prefill_decode_query_kv_tokens(attn_metadata, attn_type)
decode_query = query[num_prefill_query_tokens:]
decode_output = output[num_prefill_query_tokens:]
# QKV for prefill.
query = query[:num_prefill_query_tokens]
prefill_output = output[:num_prefill_query_tokens]
assert query.shape[0] == num_prefill_query_tokens
assert decode_query.shape[0] == num_decode_query_tokens

prefill_output: Optional[torch.Tensor] = None
decode_output: Optional[torch.Tensor] = None
if prefill_meta := attn_metadata.prefill_metadata:
# Prompt run.
if (kv_cache.numel() == 0 or prefill_meta.block_tables is None
@@ -741,7 +733,7 @@ def forward(
key = key[:num_prefill_kv_tokens]
value = value[:num_prefill_kv_tokens]

prefill_output = flash_attn_varlen_func(
flash_attn_varlen_func(
q=query,
k=key,
v=value,
@@ -754,14 +746,15 @@ def forward(
window_size=window_size,
alibi_slopes=alibi_slopes,
softcap=logits_soft_cap,
out=prefill_output,
)
else:
# prefix-enabled attention
assert attn_type == AttentionType.DECODER, (
"Only decoder-only models support prefix caching")
assert prefill_meta.seq_lens is not None
max_seq_len = max(prefill_meta.seq_lens)
prefill_output = flash_attn_varlen_func( # noqa
flash_attn_varlen_func( # noqa
q=query,
k=key_cache,
v=value_cache,
@@ -775,6 +768,7 @@ def forward(
alibi_slopes=alibi_slopes,
block_table=prefill_meta.block_tables,
softcap=logits_soft_cap,
out=prefill_output,
)

if decode_meta := attn_metadata.decode_metadata:
@@ -788,7 +782,7 @@ def forward(
assert attn_type == AttentionType.DECODER, (
"Only decoder-only models support max_decode_query_len > 1"
)
decode_output = flash_attn_varlen_func(
flash_attn_varlen_func(
q=decode_query,
k=key_cache,
v=value_cache,
@@ -802,6 +796,7 @@ def forward(
alibi_slopes=alibi_slopes,
softcap=logits_soft_cap,
block_table=decode_meta.block_tables,
out=decode_output,
)
else:
# Use flash_attn_with_kvcache for normal decoding.
@@ -810,7 +805,7 @@ def forward(
_,
block_tables_arg,
) = get_seq_len_block_table_args(decode_meta, False, attn_type)
decode_output = flash_attn_with_kvcache(
flash_attn_with_kvcache(
q=decode_query.unsqueeze(1),
k_cache=key_cache,
v_cache=value_cache,
@@ -821,20 +816,8 @@ def forward(
window_size=window_size,
alibi_slopes=alibi_slopes,
softcap=logits_soft_cap,
).squeeze(1)

if prefill_output is None:
assert decode_output is not None
return decode_output.view(num_decode_query_tokens, hidden_size)
if decode_output is None:
assert prefill_output is not None
return prefill_output.view(num_prefill_query_tokens, hidden_size)

assert decode_meta is not None
decode_output = decode_output.squeeze(1)
output = torch.cat([prefill_output, decode_output], dim=0)
return output.view(num_tokens, hidden_size)

out=decode_output.unsqueeze(1),
)
return output
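
The net effect in this backend: `prefill_output` and `decode_output` are now slices of the caller-provided `output`, so writing into them through `out=` already yields the concatenated result and the old squeeze/cat epilogue goes away. A small illustration under assumed shapes (plain PyTorch stand-ins, not the real kernels):

import torch

num_prefill_query_tokens, num_decode_query_tokens = 5, 3
num_heads, head_size = 4, 8

output = torch.empty(num_prefill_query_tokens + num_decode_query_tokens,
                     num_heads, head_size)

# Both halves share storage with `output`.
prefill_output = output[:num_prefill_query_tokens]
decode_output = output[num_prefill_query_tokens:]

# Stand-ins for flash_attn_varlen_func(..., out=prefill_output) and
# flash_attn_with_kvcache(..., out=decode_output.unsqueeze(1)).
torch.randn(num_prefill_query_tokens, num_heads, head_size, out=prefill_output)
torch.randn(num_decode_query_tokens, num_heads, head_size, out=decode_output)

# No torch.cat is needed: `output` already holds the prefill rows followed by
# the decode rows.
assert output[:num_prefill_query_tokens].equal(prefill_output)
assert output[num_prefill_query_tokens:].equal(decode_output)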


4 changes: 4 additions & 0 deletions vllm/attention/backends/flashinfer.py
@@ -774,7 +774,11 @@ def forward(
k_scale: float = 1.0,
v_scale: float = 1.0,
attn_type: str = AttentionType.DECODER,
output: Optional[torch.Tensor] = None,
) -> torch.Tensor:

# TODO: directly write to output tensor

if attn_type != AttentionType.DECODER:
raise NotImplementedError("Encoder self-attention and "
"encoder/decoder cross-attention "
1 change: 1 addition & 0 deletions vllm/attention/backends/hpu_attn.py
@@ -145,6 +145,7 @@ def forward(
k_scale: float = 1.0,
v_scale: float = 1.0,
attn_type: str = AttentionType.DECODER,
output: Optional[torch.Tensor] = None,
) -> torch.Tensor:
"""Forward pass with xFormers and PagedAttention.
1 change: 1 addition & 0 deletions vllm/attention/backends/ipex_attn.py
@@ -173,6 +173,7 @@ def forward(
k_scale: float = 1.0,
v_scale: float = 1.0,
attn_type: str = AttentionType.DECODER,
output: Optional[torch.Tensor] = None,
) -> torch.Tensor:
"""Forward pass with IPEX varlen_attention and PagedAttention.
1 change: 1 addition & 0 deletions vllm/attention/backends/pallas.py
@@ -151,6 +151,7 @@ def forward(
k_scale: float = 1.0,
v_scale: float = 1.0,
attn_type: str = AttentionType.DECODER,
output: Optional[torch.Tensor] = None,
) -> torch.Tensor:
"""Forward pass with Pallas attention.
1 change: 1 addition & 0 deletions vllm/attention/backends/rocm_flash_attn.py
@@ -415,6 +415,7 @@ def forward(
k_scale: float = 1.0,
v_scale: float = 1.0,
attn_type: str = AttentionType.DECODER,
output: Optional[torch.Tensor] = None,
) -> torch.Tensor:
"""Forward pass with FlashAttention and PagedAttention.
1 change: 1 addition & 0 deletions vllm/attention/backends/torch_sdpa.py
@@ -431,6 +431,7 @@ def forward(
k_scale: float = 1.0,
v_scale: float = 1.0,
attn_type: str = AttentionType.DECODER,
output: Optional[torch.Tensor] = None,
) -> torch.Tensor:
"""Forward pass with torch SDPA and PagedAttention.
1 change: 1 addition & 0 deletions vllm/attention/backends/xformers.py
@@ -417,6 +417,7 @@ def forward(
k_scale: float = 1.0,
v_scale: float = 1.0,
attn_type: str = AttentionType.DECODER,
output: Optional[torch.Tensor] = None,
) -> torch.Tensor:
"""Forward pass with xFormers and PagedAttention.
76 changes: 72 additions & 4 deletions vllm/attention/layer.py
@@ -4,15 +4,14 @@
import torch
import torch.nn as nn

import vllm.envs as envs
from vllm.attention import AttentionMetadata, AttentionType
from vllm.attention.selector import backend_name_to_enum, get_attn_backend
from vllm.config import CacheConfig, get_current_vllm_config
from vllm.forward_context import ForwardContext, get_forward_context
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
from vllm.platforms import current_platform
from vllm.platforms import _Backend, current_platform
from vllm.utils import direct_register_custom_op


@@ -97,14 +96,23 @@ def __init__(
self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads,
alibi_slopes, sliding_window, kv_cache_dtype,
blocksparse_params, logits_soft_cap)
self.num_heads = num_heads
self.head_size = head_size
self.num_kv_heads = num_kv_heads
self.backend = backend_name_to_enum(attn_backend.get_name())

# For cuda-alike (CUDA and ROCM) and cpu platforms, we control how
# torch.compile works by registering the attention as one giant
# opaque custom op. For other platforms, we directly call them
# and let torch.compile handle them.
self.use_direct_call = envs.VLLM_USE_V1 or not (
current_platform.is_cuda_alike() or current_platform.is_cpu())
self.use_direct_call = not current_platform.is_cuda_alike(
) and not current_platform.is_cpu()

# For some attention backends, we allocate an output tensor before
# calling the custom op. When piecewise cudagraph is enabled, this
# makes sure the output tensor is allocated inside the cudagraph.
self.use_output = self.backend == _Backend.FLASH_ATTN or \
self.backend == _Backend.FLASH_ATTN_VLLM_V1
compilation_config = get_current_vllm_config().compilation_config
if prefix in compilation_config.static_forward_context:
raise ValueError(f"Duplicate layer name: {prefix}")
@@ -130,6 +138,22 @@ def forward(
self._k_scale,
self._v_scale,
attn_type=attn_type)
elif self.use_output:
output = torch.empty_like(query)
hidden_size = query.size(-1)
# Reshape the query, key, and value tensors.
# NOTE(woosuk): We do this outside the custom op to minimize the
# CPU overheads from the non-CUDA-graph regions.
query = query.view(-1, self.num_heads, self.head_size)
output = output.view(-1, self.num_heads, self.head_size)
if key is not None:
key = key.view(-1, self.num_kv_heads, self.head_size)
if value is not None:
value = value.view(-1, self.num_kv_heads, self.head_size)
torch.ops.vllm.unified_attention_with_output(
query, key, value, output, kv_cache, attn_type,
self.layer_name)
return output.view(-1, hidden_size)
else:
return torch.ops.vllm.unified_attention(query, key, value,
kv_cache, attn_type,
@@ -183,3 +207,47 @@ def unified_attention_fake(
fake_impl=unified_attention_fake,
dispatch_key=current_platform.dispatch_key,
)


def unified_attention_with_output(
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
output: torch.Tensor,
kv_cache: torch.Tensor,
attn_type: str,
layer_name: str,
) -> None:
forward_context: ForwardContext = get_forward_context()
attn_metadata = forward_context.dynamic_forward_context
self = forward_context.static_forward_context[layer_name]
self.impl.forward(query,
key,
value,
kv_cache,
attn_metadata,
self._k_scale,
self._v_scale,
attn_type=attn_type,
output=output)


def unified_attention_with_output_fake(
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
output: torch.Tensor,
kv_cache: torch.Tensor,
attn_type: str,
layer_name: str,
) -> None:
return


direct_register_custom_op(
op_name="unified_attention_with_output",
op_func=unified_attention_with_output,
mutates_args=["kv_cache", "output"],
fake_impl=unified_attention_with_output_fake,
dispatch_key=current_platform.dispatch_key,
)
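
For context on the registration above: listing `output` in `mutates_args` declares that the op writes into that tensor in place, which is what lets the layer allocate `output` before the call (and, with piecewise cudagraph enabled, inside the captured region, as the comment in `__init__` notes). A rough standalone sketch of the same idea using `torch.library.custom_op` directly instead of vLLM's `direct_register_custom_op` helper (demo op name and shapes are assumptions; requires PyTorch 2.4+):

import torch


# A toy opaque op that, like unified_attention_with_output, returns nothing
# and writes its result into a caller-provided buffer.
@torch.library.custom_op("demo::attn_with_output", mutates_args=["output"])
def attn_with_output(query: torch.Tensor, output: torch.Tensor) -> None:
    output.copy_(query)  # stand-in for the real attention computation


@attn_with_output.register_fake
def _(query: torch.Tensor, output: torch.Tensor) -> None:
    # Fake (meta) implementation: no allocation and no computation, mirroring
    # unified_attention_with_output_fake above.
    return None


q = torch.randn(4, 8)
out = torch.empty_like(q)
torch.ops.demo.attn_with_output(q, out)
assert out.equal(q)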
2 changes: 1 addition & 1 deletion vllm/config.py
@@ -2238,7 +2238,7 @@ class CompilationConfig(BaseModel):
custom_ops: List[str] = Field(default_factory=list)
splitting_ops: List[str] = Field(default_factory=lambda: [
"vllm.unified_attention",
"vllm.unified_v1_flash_attention",
"vllm.unified_attention_with_output",
])

use_inductor: bool = True