From 5ecf50ae60455d5dd602bde7a81e6e685629ceaf Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 2 Jan 2025 08:40:48 -0800 Subject: [PATCH] [V1][Minor] Optimize token_ids_cpu copy Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu_input_batch.py | 13 ++++++++----- vllm/v1/worker/gpu_model_runner.py | 1 + 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index e79145300fe06..f8a1427c6c26c 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -66,8 +66,9 @@ def __init__( pin_memory=False, ) self.token_ids_cpu = self.token_ids_cpu_tensor.numpy() - self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32) + self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32) self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32) + self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32) # Attention-related. self.block_table = torch.zeros( @@ -189,6 +190,7 @@ def add_request( end_idx = start_idx + len(request.output_token_ids) self.token_ids_cpu[req_index, start_idx:end_idx] = request.output_token_ids + self.num_tokens[req_index] = request.num_tokens self.num_computed_tokens_cpu[req_index] = request.num_computed_tokens num_blocks = len(request.block_ids) @@ -290,14 +292,15 @@ def condense(self, empty_req_indices: List[int]) -> None: self.req_ids[last_req_index] = None self.req_id_to_index[req_id] = empty_index - # TODO(woosuk): Optimize the copy of token_ids_cpu and - # block_table_cpu. - self.token_ids_cpu[empty_index] = self.token_ids_cpu[ - last_req_index] + num_tokens = self.num_tokens[last_req_index] + self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[ + last_req_index, :num_tokens] + self.num_tokens[empty_index] = num_tokens self.num_prompt_tokens[empty_index] = \ self.num_prompt_tokens[last_req_index] self.num_computed_tokens_cpu[ empty_index] = self.num_computed_tokens_cpu[last_req_index] + # TODO(woosuk): Optimize the copy of block_table_cpu. self.block_table_cpu[empty_index] = self.block_table_cpu[ last_req_index] self.temperature_cpu[empty_index] = self.temperature_cpu[ diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 995de54e8e0a0..75098b0330ac9 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -644,6 +644,7 @@ def execute_model( # Append the sampled token to the output token ids. token_id = sampled_token_ids[i] self.input_batch.token_ids_cpu[i, seq_len] = token_id + self.input_batch.num_tokens[i] += 1 req_state.output_token_ids.append(token_id) else: # Ignore the sampled token from the partial request.