From 28c3f121040dd80d3540fb7d36e0b7a4817da28c Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Mon, 8 Jan 2024 13:13:08 -0800
Subject: [PATCH] [Minor] Remove unused code in attention (#2384)

---
 vllm/model_executor/layers/attention.py | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py
index 6482875d1c55b..f1008ec8159f6 100644
--- a/vllm/model_executor/layers/attention.py
+++ b/vllm/model_executor/layers/attention.py
@@ -156,20 +156,15 @@ def forward(
             output = out.view_as(query)
         else:
             # Decoding run.
-            if key_cache is not None and value_cache is not None:
-                output = _paged_attention(
-                    query,
-                    key_cache,
-                    value_cache,
-                    input_metadata,
-                    self.num_kv_heads,
-                    self.scale,
-                    self.alibi_slopes,
-                )
-            else:
-                # This happens during the initial memory profiling run for
-                # CUDA graphs.
-                output = torch.zeros_like(query)
+            output = _paged_attention(
+                query,
+                key_cache,
+                value_cache,
+                input_metadata,
+                self.num_kv_heads,
+                self.scale,
+                self.alibi_slopes,
+            )
 
         # Reshape the output tensor.
         return output.view(batch_size, seq_len, hidden_size)
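
The snippet below is not part of the patch; it is a minimal, self-contained sketch of the simplification this diff makes to the decode branch of `forward`. It assumes, as the removed comment suggests, that `key_cache`/`value_cache` are no longer `None` even during the initial memory-profiling run for CUDA graphs, so the zero-filled fallback is dead code and the branch collapses to a single `_paged_attention` call. The stub kernel, function names, and tensor shapes here are placeholders for illustration, not vLLM's real implementation.

import torch


def _paged_attention_stub(query: torch.Tensor, key_cache: torch.Tensor,
                          value_cache: torch.Tensor) -> torch.Tensor:
    # Placeholder for vLLM's real paged-attention kernel: returns a tensor
    # with the same shape as `query`.
    return torch.zeros_like(query)


def decode(query: torch.Tensor, key_cache: torch.Tensor,
           value_cache: torch.Tensor) -> torch.Tensor:
    # After this patch there is no `is not None` check: the caches are
    # assumed to always be present, so the kernel runs unconditionally.
    return _paged_attention_stub(query, key_cache, value_cache)


if __name__ == "__main__":
    batch_size, seq_len, hidden_size = 2, 1, 64
    query = torch.randn(batch_size * seq_len, hidden_size)
    key_cache = torch.randn(batch_size * seq_len, hidden_size)
    value_cache = torch.randn(batch_size * seq_len, hidden_size)
    output = decode(query, key_cache, value_cache)
    print(output.view(batch_size, seq_len, hidden_size).shape)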