From 907ba31fec02142729eaa0207517241c0b5bf758 Mon Sep 17 00:00:00 2001
From: Richard Gong
Date: Wed, 13 Dec 2023 18:15:47 -0500
Subject: [PATCH] fix token counting (#520)

---
 06_gpu_and_ml/vllm_mixtral.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/06_gpu_and_ml/vllm_mixtral.py b/06_gpu_and_ml/vllm_mixtral.py
index 8593bf4aa..fe1e61ea2 100644
--- a/06_gpu_and_ml/vllm_mixtral.py
+++ b/06_gpu_and_ml/vllm_mixtral.py
@@ -136,16 +136,17 @@ async def completion_stream(self, user_question):
             sampling_params,
             request_id,
         )
-        index = 0
+        index, num_tokens = 0, 0
         async for output in result_generator:
             if "\ufffd" == output.outputs[0].text[-1]:
                 continue
             text_delta = output.outputs[0].text[index:]
             index = len(output.outputs[0].text)
+            num_tokens = len(output.outputs[0].token_ids)
 
             yield text_delta
 
-        print(f"Generated {index} tokens in {time.time() - t0:.2f}s")
+        print(f"Generated {num_tokens} tokens in {time.time() - t0:.2f}s")
 
 
 # ## Run the model
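
Why the fix matters: `index` is the character offset used to slice out each streaming delta, not a token count, so the old log line reported the number of characters generated as if it were the number of tokens. The patch tracks `len(output.outputs[0].token_ids)` separately. Below is a minimal, self-contained sketch of the distinction; it fakes the vLLM output stream with stand-in classes (FakeCompletion, FakeRequestOutput, and fake_stream are hypothetical names for illustration, not vLLM API), since running the real engine requires a GPU:

import asyncio
from dataclasses import dataclass
from typing import List


@dataclass
class FakeCompletion:
    # Mirrors the two fields the patch touches on vLLM's per-request output.
    text: str
    token_ids: List[int]


@dataclass
class FakeRequestOutput:
    outputs: List[FakeCompletion]


async def fake_stream():
    # Each step appends one "token"; a token is usually several characters,
    # which is why counting characters overstates the token count.
    text, token_ids = "", []
    for i, piece in enumerate(["Hello", ",", " world", "!"]):
        text += piece
        token_ids.append(i)
        yield FakeRequestOutput(outputs=[FakeCompletion(text, list(token_ids))])


async def main():
    index, num_tokens = 0, 0
    async for output in fake_stream():
        text_delta = output.outputs[0].text[index:]    # the streamed delta
        index = len(output.outputs[0].text)            # characters so far
        num_tokens = len(output.outputs[0].token_ids)  # tokens so far
    # The pre-patch log printed `index` here: 13 "tokens" instead of 4.
    print(f"characters: {index}, tokens: {num_tokens}")


asyncio.run(main())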