diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index 996e60bfee592..3e9887d4ac658 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -46,6 +46,7 @@ def phi3v_tokenizer():
         tokenizer_id=PHI3V_MODEL_ID,
         enable_lora=False,
         max_num_seqs=5,
+        max_loras=0,
         max_input_length=None,
     )
 
@@ -70,6 +71,7 @@ def mllama_tokenizer():
         MLLAMA_MODEL_ID,
         enable_lora=False,
         max_num_seqs=5,
+        max_loras=0,
         max_input_length=None,
     )
 
@@ -682,6 +684,7 @@ def get_conversation(is_hf: bool):
         MLLAMA_MODEL_ID,
         enable_lora=False,
         max_num_seqs=5,
+        max_loras=0,
         max_input_length=None,
     )
     tokenizer = tokenizer_group.tokenizer
@@ -728,6 +731,7 @@ def test_resolve_content_format_hf_defined(model, expected_format):
         model,
         enable_lora=False,
         max_num_seqs=5,
+        max_loras=0,
         max_input_length=None,
     )
     tokenizer = tokenizer_group.tokenizer
@@ -777,6 +781,7 @@ def test_resolve_content_format_examples(template_path, expected_format):
         PHI3V_MODEL_ID,
         enable_lora=False,
         max_num_seqs=5,
+        max_loras=0,
         max_input_length=None,
     )
     dummy_tokenizer = tokenizer_group.tokenizer
diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py
index e8f8499aa88ca..aac66a9e9ab9e 100644
--- a/tests/test_cache_block_hashing.py
+++ b/tests/test_cache_block_hashing.py
@@ -49,6 +49,7 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int,
         tokenizer_id="facebook/opt-125m",
         enable_lora=False,
         max_num_seqs=max_num_seqs,
+        max_loras=0,
         max_input_length=None,
     )
 
diff --git a/tests/tokenization/test_tokenizer_group.py b/tests/tokenization/test_tokenizer_group.py
index 3faaf326f5422..1e476e3533aac 100644
--- a/tests/tokenization/test_tokenizer_group.py
+++ b/tests/tokenization/test_tokenizer_group.py
@@ -36,6 +36,7 @@ async def test_tokenizer_group(tokenizer_group_type):
         tokenizer_id="gpt2",
         enable_lora=False,
         max_num_seqs=1,
+        max_loras=0,
         max_input_length=None,
     )
     assert reference_tokenizer.encode("prompt") == tokenizer_group.encode(
@@ -60,6 +61,7 @@ async def test_tokenizer_group_pool(tokenizer_group_type):
         tokenizer_id="gpt2",
         enable_lora=False,
         max_num_seqs=1,
+        max_loras=0,
         max_input_length=None,
     )
     # Send multiple requests to the tokenizer group pool
@@ -102,6 +104,7 @@ class EnvVarCheckerRayTokenizerGroupPool(RayTokenizerGroupPool):
         tokenizer_id="gpt2",
         enable_lora=False,
         max_num_seqs=1,
+        max_loras=0,
         max_input_length=None)
     with pytest.raises(AssertionError):
         tokenizer_pool.ping()
@@ -113,6 +116,7 @@ class EnvVarCheckerRayTokenizerGroupPool(RayTokenizerGroupPool):
         tokenizer_id="gpt2",
         enable_lora=False,
         max_num_seqs=1,
+        max_loras=0,
         max_input_length=None)
     tokenizer_pool.ping()
 
@@ -150,6 +154,7 @@ class FailingRayTokenizerGroupPool(RayTokenizerGroupPool):
         tokenizer_id="gpt2",
         enable_lora=False,
         max_num_seqs=1,
+        max_loras=0,
         max_input_length=None,
         fail_at=fail_at)
     tokenizer_actors = tokenizer_group_pool.tokenizer_actors.copy()
@@ -177,6 +182,7 @@ class FailingRayTokenizerGroupPool(RayTokenizerGroupPool):
         tokenizer_id="gpt2",
         enable_lora=False,
         max_num_seqs=1,
+        max_loras=0,
         max_input_length=None,
         fail_at=fail_at)
 
@@ -198,6 +204,7 @@ class FailingRayTokenizerGroupPool(RayTokenizerGroupPool):
         tokenizer_id="gpt2",
         enable_lora=False,
         max_num_seqs=1,
+        max_loras=0,
         max_input_length=2,
         fail_at=fail_at)
     tokenizer_actors = tokenizer_group_pool.tokenizer_actors.copy()
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index ecc222f692c41..3f91caba60e09 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -619,7 +619,7 @@ def _init_tokenizer(self) -> BaseTokenizerGroup:
             model_config=self.model_config,
             scheduler_config=self.scheduler_config,
             parallel_config=self.parallel_config,
-            enable_lora=bool(self.lora_config))
+            lora_config=self.lora_config)
 
     def _verify_args(self) -> None:
         self.model_config.verify_with_parallel_config(self.parallel_config)
diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
index fe21c58c775fe..f2bba72c476ac 100644
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -93,8 +93,7 @@ def __init__(self, ipc_path: str, engine_config: VllmConfig,
             model_config=self.model_config,
             scheduler_config=engine_config.scheduler_config,
             parallel_config=engine_config.parallel_config,
-            enable_lora=bool(engine_config.lora_config),
-        )
+            lora_config=engine_config.lora_config)
         self.input_preprocessor = InputPreprocessor(self.model_config,
                                                     self.tokenizer)
 
diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py
index 6a114b513f382..c0b3d2585a962 100644
--- a/vllm/transformers_utils/tokenizer_group/__init__.py
+++ b/vllm/transformers_utils/tokenizer_group/__init__.py
@@ -1,7 +1,7 @@
 from typing import Optional, Type
 
-from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig,
-                         TokenizerPoolConfig)
+from vllm.config import (LoRAConfig, ModelConfig, ParallelConfig,
+                         SchedulerConfig, TokenizerPoolConfig)
 from vllm.executor.ray_utils import ray
 
 from .base_tokenizer_group import AnyTokenizer, BaseTokenizerGroup
@@ -16,10 +16,11 @@ def init_tokenizer_from_configs(model_config: ModelConfig,
                                 scheduler_config: SchedulerConfig,
                                 parallel_config: ParallelConfig,
-                                enable_lora: bool):
+                                lora_config: LoRAConfig):
     init_kwargs = dict(tokenizer_id=model_config.tokenizer,
-                       enable_lora=enable_lora,
+                       enable_lora=bool(lora_config),
                        max_num_seqs=scheduler_config.max_num_seqs,
+                       max_loras=lora_config.max_loras if lora_config else 0,
                        max_input_length=None,
                        tokenizer_mode=model_config.tokenizer_mode,
                        trust_remote_code=model_config.trust_remote_code,
                        revision=model_config.tokenizer_revision)
diff --git a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
index 9a999a0d6067d..3e9b56059db69 100644
--- a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
@@ -51,14 +51,15 @@ def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig],
         return cls(**init_kwargs)
 
     def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
-                 max_input_length: Optional[int], num_actors: int,
-                 ray_actor_options: dict, **tokenizer_config):
+                 max_loras: int, max_input_length: Optional[int],
+                 num_actors: int, ray_actor_options: dict, **tokenizer_config):
         # Store a local copy of the TokenizerGroup for quick access
         # to underlying HF tokenizers.
         self._tokenizer_config = {
             "tokenizer_id": tokenizer_id,
             "enable_lora": enable_lora,
             "max_num_seqs": max_num_seqs,
+            "max_loras": max_loras,
             "max_input_length": max_input_length,
             **tokenizer_config
         }
diff --git a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
index e516eeabaadef..2e1fcf2de16d4 100644
--- a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
@@ -15,14 +15,15 @@ class TokenizerGroup(BaseTokenizerGroup):
     """A group of tokenizers that can be used for LoRA adapters."""
 
     def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
-                 max_input_length: Optional[int], **tokenizer_config):
+                 max_loras: int, max_input_length: Optional[int],
+                 **tokenizer_config):
         self.tokenizer_id = tokenizer_id
         self.tokenizer_config = tokenizer_config
         self.enable_lora = enable_lora
         self.max_input_length = max_input_length
         self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config)
         self.lora_tokenizers = LRUCache[AnyTokenizer](
-            capacity=max_num_seqs if enable_lora else 0)
+            capacity=max(max_loras, max_num_seqs) if enable_lora else 0)
 
     @classmethod
     def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig],
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index a17c8eac4b77c..175616a518ccf 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -51,7 +51,7 @@ def __init__(
             model_config=vllm_config.model_config,
             scheduler_config=vllm_config.scheduler_config,
             parallel_config=vllm_config.parallel_config,
-            enable_lora=bool(vllm_config.lora_config))
+            lora_config=vllm_config.lora_config)
         self.tokenizer.ping()
 
         # Request streams (map of request_id -> AsyncStream).
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index bd19d998a4adb..312c0242a45dd 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -46,7 +46,7 @@ def __init__(
             model_config=vllm_config.model_config,
             scheduler_config=vllm_config.scheduler_config,
             parallel_config=vllm_config.parallel_config,
-            enable_lora=bool(vllm_config.lora_config))
+            lora_config=vllm_config.lora_config)
         self.tokenizer.ping()
 
         # Processor (convert Inputs --> EngineCoreRequests)
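The behavioral core of this diff is the new LoRA-tokenizer cache sizing in TokenizerGroup: with LoRA enabled, the LRU cache capacity becomes max(max_loras, max_num_seqs) instead of max_num_seqs alone. Below is a minimal sketch (not part of the diff, and not vLLM's API) that mirrors that rule; the names lora_tokenizer_cache_capacity and TinyLRUCache are hypothetical helpers for illustration only, with TinyLRUCache standing in for vLLM's own LRUCache utility.

```python
# Illustrative sketch only -- hypothetical helpers, not vLLM code.
from collections import OrderedDict


def lora_tokenizer_cache_capacity(enable_lora: bool, max_num_seqs: int,
                                  max_loras: int) -> int:
    """Mirror the updated sizing rule from TokenizerGroup.__init__."""
    # Old rule: max_num_seqs if enable_lora else 0
    # New rule: max(max_loras, max_num_seqs) if enable_lora else 0
    return max(max_loras, max_num_seqs) if enable_lora else 0


class TinyLRUCache:
    """Toy LRU cache used only to illustrate the capacity effect."""

    def __init__(self, capacity: int):
        self.capacity = capacity
        self.entries: "OrderedDict[str, str]" = OrderedDict()

    def get_or_load(self, lora_id: str) -> str:
        if lora_id in self.entries:
            self.entries.move_to_end(lora_id)     # mark as recently used
            return self.entries[lora_id]
        tokenizer = f"tokenizer-for-{lora_id}"    # pretend to load an adapter tokenizer
        if self.capacity > 0:
            self.entries[lora_id] = tokenizer
            if len(self.entries) > self.capacity:
                self.entries.popitem(last=False)  # evict least recently used
        return tokenizer


if __name__ == "__main__":
    # With max_loras larger than max_num_seqs, the cache can now hold
    # one tokenizer per adapter rather than being capped at max_num_seqs.
    capacity = lora_tokenizer_cache_capacity(enable_lora=True,
                                             max_num_seqs=2,
                                             max_loras=8)
    cache = TinyLRUCache(capacity)
    for lora_id in ["a", "b", "c", "a"]:
        cache.get_or_load(lora_id)
    print(capacity)            # 8
    print(len(cache.entries))  # 3 -- no eviction needed
```

The tests above pass max_loras=0 because they construct tokenizer groups with enable_lora=False, where the cache capacity is 0 regardless.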