diff --git a/requirements-common.txt b/requirements-common.txt
index 112528880c0ac..e2347c72a6167 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -3,6 +3,7 @@ sentencepiece  # Required for LLaMA tokenizer.
 numpy < 2.0.0
 requests >= 2.26.0
 tqdm
+blake3
 py-cpuinfo
 transformers >= 4.45.2  # Required for Llama 3.2 and Qwen2-VL.
 tokenizers >= 0.19.1  # Required for Llama 3.
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 625e2d943666f..08096020662f5 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -90,6 +90,11 @@ def add_request(self, request: EngineCoreRequest):
         """Add request to the scheduler."""
 
         if request.mm_hashes is not None:
+            # Here, if a hash exists for an image, it will be fetched
+            # from the cache; otherwise it will be added to the cache.
+            # Note that the cache here is mirrored with the client side of
+            # the MM mapper, so anything that has a hash must have a HIT
+            # cache entry here as well.
             request.mm_inputs = self.mm_input_mapper_server.process_inputs(
                 request.mm_inputs, request.mm_hashes)
 
diff --git a/vllm/v1/engine/mm_input_mapper.py b/vllm/v1/engine/mm_input_mapper.py
index 68c476cbca07b..9feb7040da393 100644
--- a/vllm/v1/engine/mm_input_mapper.py
+++ b/vllm/v1/engine/mm_input_mapper.py
@@ -44,17 +44,15 @@ def __init__(
 
         self.mm_cache = LRUDictCache(MM_CACHE_SIZE)
 
-        # Set to None to disable (TODO: Disable!)
-        self.mm_debug_cache_hit_ratio_steps = 32
+        # DEBUG: Set to None to disable
+        self.mm_debug_cache_hit_ratio_steps = None
         self.mm_cache_hits = 0
-        self.mm_cache_misses = 0
+        self.mm_cache_total = 0
 
     def cache_hit_ratio(self, steps) -> float:
-        total_steps = self.mm_cache_hits + self.mm_cache_misses
-
-        if total_steps > 0 and total_steps % steps == 0:
+        if self.mm_cache_total > 0 and self.mm_cache_total % steps == 0:
             logger.debug("MMInputMapper: cache_hit_ratio = %.2f",
-                         self.mm_cache_hits / total_steps)
+                         self.mm_cache_hits / self.mm_cache_total)
 
     def process_inputs(
         self,
@@ -93,8 +91,8 @@ def process_inputs(
 
             mm_hash = mm_hashes[input_id]
             mm_input = self.mm_cache.get(mm_hash)
+            self.mm_cache_total += 1
             if mm_input is None:
-                self.mm_cache_misses += 1
                 if precomputed_mm_inputs is not None:
                     # Reuse precomputed input (for merged preprocessor)
                     mm_input = precomputed_mm_inputs[input_id]
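
Note on the caching scheme in this patch: processed multi-modal inputs are keyed by content hash (the new blake3 dependency suggests a BLAKE3 digest of the raw image data), looked up in an LRU cache, and a running hit ratio is tracked for debugging. The sketch below is illustrative only: LRUDictCache is paraphrased from the diff, while hash_image_bytes, MM_CACHE_SIZE = 256, and the lookup method are hypothetical stand-ins for vLLM's actual hashing and process_inputs logic.

    from collections import OrderedDict

    import blake3  # New dependency added in requirements-common.txt above.

    MM_CACHE_SIZE = 256  # Assumed value; the real constant lives in vLLM.


    def hash_image_bytes(data: bytes) -> str:
        # Hypothetical helper: derive an mm_hash from raw image bytes.
        return blake3.blake3(data).hexdigest()


    class LRUDictCache:
        # Minimal dict-backed LRU cache; assumes evict-least-recently-used
        # semantics for the LRUDictCache named in the diff.

        def __init__(self, size: int):
            self.size = size
            self.cache: OrderedDict = OrderedDict()

        def get(self, key, default=None):
            if key not in self.cache:
                return default
            self.cache.move_to_end(key)  # Mark as most recently used.
            return self.cache[key]

        def put(self, key, value):
            self.cache[key] = value
            self.cache.move_to_end(key)
            if len(self.cache) > self.size:
                self.cache.popitem(last=False)  # Evict the oldest entry.


    class MMInputMapperSketch:
        # Hit-ratio accounting as restructured by the patch: a single total
        # counter bumped on every lookup, instead of separate hit and miss
        # counters that must be summed on each debug check.

        def __init__(self, debug_steps=None):
            self.mm_cache = LRUDictCache(MM_CACHE_SIZE)
            self.mm_debug_cache_hit_ratio_steps = debug_steps
            self.mm_cache_hits = 0
            self.mm_cache_total = 0

        def lookup(self, mm_hash, compute_input):
            self.mm_cache_total += 1
            mm_input = self.mm_cache.get(mm_hash)
            if mm_input is None:
                mm_input = compute_input()  # Miss: process and cache it.
                self.mm_cache.put(mm_hash, mm_input)
            else:
                self.mm_cache_hits += 1
            steps = self.mm_debug_cache_hit_ratio_steps
            if steps is not None and self.mm_cache_total % steps == 0:
                print("cache_hit_ratio = %.2f" %
                      (self.mm_cache_hits / self.mm_cache_total))
            return mm_input

Incrementing mm_cache_total before the hit/miss branch keeps the denominator consistent with every lookup, which is why the patch moves the counter update above the `if mm_input is None` check and drops the separate miss counter.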