Minor: set attn_metadata.num_input_tokens right after num_input_tokens is determined

Signed-off-by: Woosuk Kwon <[email protected]>
vllm-project · Dec 10, 2024 · 426d3f6
1 parent e2baff8
commit 426d3f6
Showing 1 changed file with 1 addition and 1 deletion.
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
@@ -453,6 +453,7 @@ def execute_model(
         else:
             # Eager mode.
             num_input_tokens = num_scheduled_tokens
+        attn_metadata.num_input_tokens = num_input_tokens
 
         if self.is_multimodal_model:
             # NOTE(woosuk): To unify token ids and soft tokens (vision
@@ -478,7 +479,6 @@ def execute_model(
 
         # Run the decoder.
         # Use persistent buffers for CUDA graphs.
-        attn_metadata.num_input_tokens = num_input_tokens
         with set_forward_context(attn_metadata, self.vllm_config):
             hidden_states = self.model(
                 input_ids=input_ids,