From 907ba31fec02142729eaa0207517241c0b5bf758 Mon Sep 17 00:00:00 2001
From: Richard Gong
Date: Wed, 13 Dec 2023 18:15:47 -0500
Subject: [PATCH] fix token counting (#520)

---
 06_gpu_and_ml/vllm_mixtral.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/06_gpu_and_ml/vllm_mixtral.py b/06_gpu_and_ml/vllm_mixtral.py
index 8593bf4aa..fe1e61ea2 100644
--- a/06_gpu_and_ml/vllm_mixtral.py
+++ b/06_gpu_and_ml/vllm_mixtral.py
@@ -136,16 +136,17 @@ async def completion_stream(self, user_question):
             sampling_params,
             request_id,
         )
-        index = 0
+        index, num_tokens = 0, 0
         async for output in result_generator:
             if "\ufffd" == output.outputs[0].text[-1]:
                 continue
             text_delta = output.outputs[0].text[index:]
             index = len(output.outputs[0].text)
+            num_tokens = len(output.outputs[0].token_ids)
 
             yield text_delta
 
-        print(f"Generated {index} tokens in {time.time() - t0:.2f}s")
+        print(f"Generated {num_tokens} tokens in {time.time() - t0:.2f}s")
 
 
 # ## Run the model
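
Why the fix matters: `index` is the character offset used to slice out each streaming delta, not a token count, so the old log line reported the number of characters generated as if it were the number of tokens. The patch tracks `len(output.outputs[0].token_ids)` separately. Below is a minimal, self-contained sketch of the distinction; it fakes the vLLM output stream with stand-in classes (FakeCompletion, FakeRequestOutput, and fake_stream are hypothetical names for illustration, not vLLM API), since running the real engine requires a GPU:

import asyncio
from dataclasses import dataclass
from typing import List


@dataclass
class FakeCompletion:
    # Mirrors the two fields the patch touches on vLLM's per-request output.
    text: str
    token_ids: List[int]


@dataclass
class FakeRequestOutput:
    outputs: List[FakeCompletion]


async def fake_stream():
    # Each step appends one "token"; a token is usually several characters,
    # which is why counting characters overstates the token count.
    text, token_ids = "", []
    for i, piece in enumerate(["Hello", ",", " world", "!"]):
        text += piece
        token_ids.append(i)
        yield FakeRequestOutput(outputs=[FakeCompletion(text, list(token_ids))])


async def main():
    index, num_tokens = 0, 0
    async for output in fake_stream():
        text_delta = output.outputs[0].text[index:]    # the streamed delta
        index = len(output.outputs[0].text)            # characters so far
        num_tokens = len(output.outputs[0].token_ids)  # tokens so far
    # The pre-patch log printed `index` here: 13 "tokens" instead of 4.
    print(f"characters: {index}, tokens: {num_tokens}")


asyncio.run(main())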