Skip to content

Commit

Permalink
add blake3 dep and more review comments from Cody
Browse files Browse the repository at this point in the history
  • Loading branch information
alexm-neuralmagic committed Dec 10, 2024
1 parent 7304434 commit 185655c
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 8 deletions.
1 change: 1 addition & 0 deletions requirements-common.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ sentencepiece # Required for LLaMA tokenizer.
numpy < 2.0.0
requests >= 2.26.0
tqdm
blake3
py-cpuinfo
transformers >= 4.45.2 # Required for Llama 3.2 and Qwen2-VL.
tokenizers >= 0.19.1 # Required for Llama 3.
Expand Down
5 changes: 5 additions & 0 deletions vllm/v1/engine/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,11 @@ def add_request(self, request: EngineCoreRequest):
"""Add request to the scheduler."""

if request.mm_hashes is not None:
# If a hash already exists for an image, the image is fetched
# from the cache; otherwise it is added to the cache.
# Note that this cache mirrors the one on the client side of the
# MM mapper, so anything that has a hash is guaranteed to have a
# cache HIT entry here as well.
request.mm_inputs = self.mm_input_mapper_server.process_inputs(
request.mm_inputs, request.mm_hashes)

Expand Down
14 changes: 6 additions & 8 deletions vllm/v1/engine/mm_input_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,17 +44,15 @@ def __init__(

self.mm_cache = LRUDictCache(MM_CACHE_SIZE)

# Set to None to disable (TODO: Disable!)
self.mm_debug_cache_hit_ratio_steps = 32
# DEBUG: Set to None to disable
self.mm_debug_cache_hit_ratio_steps = None
self.mm_cache_hits = 0
self.mm_cache_misses = 0
self.mm_cache_total = 0

def cache_hit_ratio(self, steps) -> float:
total_steps = self.mm_cache_hits + self.mm_cache_misses

if total_steps > 0 and total_steps % steps == 0:
if self.mm_cache_total > 0 and self.mm_cache_total % steps == 0:
logger.debug("MMInputMapper: cache_hit_ratio = %.2f",
self.mm_cache_hits / total_steps)
self.mm_cache_hits / self.mm_cache_total)

def process_inputs(
self,
Expand Down Expand Up @@ -93,8 +91,8 @@ def process_inputs(
mm_hash = mm_hashes[input_id]
mm_input = self.mm_cache.get(mm_hash)

self.mm_cache_total += 1
if mm_input is None:
self.mm_cache_misses += 1
if precomputed_mm_inputs is not None:
# Reuse precomputed input (for merged preprocessor)
mm_input = precomputed_mm_inputs[input_id]
Expand Down

0 comments on commit 185655c

Please sign in to comment.