Skip to content

Commit

Permalink
add blake3 dep and more review comments from Cody
Browse files Browse the repository at this point in the history
  • Loading branch information
alexm-neuralmagic committed Dec 10, 2024
1 parent 7304434 commit 185655c
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 8 deletions.
1 change: 1 addition & 0 deletions requirements-common.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ sentencepiece # Required for LLaMA tokenizer.
numpy < 2.0.0
requests >= 2.26.0
tqdm
blake3
py-cpuinfo
transformers >= 4.45.2 # Required for Llama 3.2 and Qwen2-VL.
tokenizers >= 0.19.1 # Required for Llama 3.
Expand Down
5 changes: 5 additions & 0 deletions vllm/v1/engine/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,11 @@ def add_request(self, request: EngineCoreRequest):
"""Add request to the scheduler."""

if request.mm_hashes is not None:
# If a hash already exists for an image, the image is fetched
# from the cache; otherwise it is added to the cache.
# Note that this cache mirrors the one on the client side of the
# MM mapper, so anything that has a hash is guaranteed to have a
# cache HIT entry here as well.
request.mm_inputs = self.mm_input_mapper_server.process_inputs(
request.mm_inputs, request.mm_hashes)

Expand Down
14 changes: 6 additions & 8 deletions vllm/v1/engine/mm_input_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,17 +44,15 @@ def __init__(

self.mm_cache = LRUDictCache(MM_CACHE_SIZE)

# Set to None to disable (TODO: Disable!)
self.mm_debug_cache_hit_ratio_steps = 32
# DEBUG: Set to None to disable
self.mm_debug_cache_hit_ratio_steps = None
self.mm_cache_hits = 0
self.mm_cache_misses = 0
self.mm_cache_total = 0

def cache_hit_ratio(self, steps) -> float:
total_steps = self.mm_cache_hits + self.mm_cache_misses

if total_steps > 0 and total_steps % steps == 0:
if self.mm_cache_total > 0 and self.mm_cache_total % steps == 0:
logger.debug("MMInputMapper: cache_hit_ratio = %.2f",
self.mm_cache_hits / total_steps)
self.mm_cache_hits / self.mm_cache_total)

def process_inputs(
self,
Expand Down Expand Up @@ -93,8 +91,8 @@ def process_inputs(
mm_hash = mm_hashes[input_id]
mm_input = self.mm_cache.get(mm_hash)

self.mm_cache_total += 1
if mm_input is None:
self.mm_cache_misses += 1
if precomputed_mm_inputs is not None:
# Reuse precomputed input (for merged preprocessor)
mm_input = precomputed_mm_inputs[input_id]
Expand Down

0 comments on commit 185655c

Please sign in to comment.