[LoRA] Change lora_tokenizers capacity
Signed-off-by: Xin Yang <[email protected]>
xyang16 committed Nov 30, 2024
1 parent 7e4bbda commit 3b88608
Showing 10 changed files with 28 additions and 13 deletions.
5 changes: 5 additions & 0 deletions tests/entrypoints/test_chat_utils.py
@@ -46,6 +46,7 @@ def phi3v_tokenizer():
tokenizer_id=PHI3V_MODEL_ID,
enable_lora=False,
max_num_seqs=5,
+ max_loras=0,
max_input_length=None,
)

@@ -70,6 +71,7 @@ def mllama_tokenizer():
MLLAMA_MODEL_ID,
enable_lora=False,
max_num_seqs=5,
+ max_loras=0,
max_input_length=None,
)

@@ -682,6 +684,7 @@ def get_conversation(is_hf: bool):
MLLAMA_MODEL_ID,
enable_lora=False,
max_num_seqs=5,
+ max_loras=0,
max_input_length=None,
)
tokenizer = tokenizer_group.tokenizer
@@ -728,6 +731,7 @@ def test_resolve_content_format_hf_defined(model, expected_format):
model,
enable_lora=False,
max_num_seqs=5,
+ max_loras=0,
max_input_length=None,
)
tokenizer = tokenizer_group.tokenizer
@@ -777,6 +781,7 @@ def test_resolve_content_format_examples(template_path, expected_format):
PHI3V_MODEL_ID,
enable_lora=False,
max_num_seqs=5,
+ max_loras=0,
max_input_length=None,
)
dummy_tokenizer = tokenizer_group.tokenizer
1 change: 1 addition & 0 deletions tests/test_cache_block_hashing.py
@@ -49,6 +49,7 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int,
tokenizer_id="facebook/opt-125m",
enable_lora=False,
max_num_seqs=max_num_seqs,
+ max_loras=0,
max_input_length=None,
)

7 changes: 7 additions & 0 deletions tests/tokenization/test_tokenizer_group.py
@@ -36,6 +36,7 @@ async def test_tokenizer_group(tokenizer_group_type):
tokenizer_id="gpt2",
enable_lora=False,
max_num_seqs=1,
+ max_loras=0,
max_input_length=None,
)
assert reference_tokenizer.encode("prompt") == tokenizer_group.encode(
@@ -60,6 +61,7 @@ async def test_tokenizer_group_pool(tokenizer_group_type):
tokenizer_id="gpt2",
enable_lora=False,
max_num_seqs=1,
+ max_loras=0,
max_input_length=None,
)
# Send multiple requests to the tokenizer group pool
@@ -102,6 +104,7 @@ class EnvVarCheckerRayTokenizerGroupPool(RayTokenizerGroupPool):
tokenizer_id="gpt2",
enable_lora=False,
max_num_seqs=1,
+ max_loras=0,
max_input_length=None)
with pytest.raises(AssertionError):
tokenizer_pool.ping()
@@ -113,6 +116,7 @@ class EnvVarCheckerRayTokenizerGroupPool(RayTokenizerGroupPool):
tokenizer_id="gpt2",
enable_lora=False,
max_num_seqs=1,
+ max_loras=0,
max_input_length=None)
tokenizer_pool.ping()

@@ -150,6 +154,7 @@ class FailingRayTokenizerGroupPool(RayTokenizerGroupPool):
tokenizer_id="gpt2",
enable_lora=False,
max_num_seqs=1,
+ max_loras=0,
max_input_length=None,
fail_at=fail_at)
tokenizer_actors = tokenizer_group_pool.tokenizer_actors.copy()
@@ -177,6 +182,7 @@ class FailingRayTokenizerGroupPool(RayTokenizerGroupPool):
tokenizer_id="gpt2",
enable_lora=False,
max_num_seqs=1,
+ max_loras=0,
max_input_length=None,
fail_at=fail_at)

@@ -198,6 +204,7 @@ class FailingRayTokenizerGroupPool(RayTokenizerGroupPool):
tokenizer_id="gpt2",
enable_lora=False,
max_num_seqs=1,
+ max_loras=0,
max_input_length=2,
fail_at=fail_at)
tokenizer_actors = tokenizer_group_pool.tokenizer_actors.copy()
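Each of the test updates above simply adds the new max_loras keyword wherever a tokenizer group is constructed with LoRA disabled. A minimal sketch of the updated construction pattern, assuming vLLM at this commit and using "gpt2" as a stand-in tokenizer (this is not the repository's actual test code):

# Minimal sketch of the updated construction pattern (assumes vLLM at this commit).
from vllm.transformers_utils.tokenizer_group.tokenizer_group import TokenizerGroup

tokenizer_group = TokenizerGroup(
    tokenizer_id="gpt2",   # stand-in tokenizer, as in the tests above
    enable_lora=False,
    max_num_seqs=1,
    max_loras=0,           # keyword newly required by this commit
    max_input_length=None,
)

# With LoRA disabled the per-adapter cache is empty and encoding falls back to
# the base tokenizer, so the group matches the underlying HF tokenizer.
assert tokenizer_group.encode(prompt="prompt") == tokenizer_group.tokenizer.encode("prompt")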
2 changes: 1 addition & 1 deletion vllm/engine/llm_engine.py
@@ -619,7 +619,7 @@ def _init_tokenizer(self) -> BaseTokenizerGroup:
model_config=self.model_config,
scheduler_config=self.scheduler_config,
parallel_config=self.parallel_config,
- enable_lora=bool(self.lora_config))
+ lora_config=self.lora_config)

def _verify_args(self) -> None:
self.model_config.verify_with_parallel_config(self.parallel_config)
3 changes: 1 addition & 2 deletions vllm/engine/multiprocessing/client.py
@@ -93,8 +93,7 @@ def __init__(self, ipc_path: str, engine_config: VllmConfig,
model_config=self.model_config,
scheduler_config=engine_config.scheduler_config,
parallel_config=engine_config.parallel_config,
- enable_lora=bool(engine_config.lora_config),
- )
+ lora_config=engine_config.lora_config)
self.input_preprocessor = InputPreprocessor(self.model_config,
self.tokenizer)

9 changes: 5 additions & 4 deletions vllm/transformers_utils/tokenizer_group/__init__.py
@@ -1,7 +1,7 @@
from typing import Optional, Type

- from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig,
-                          TokenizerPoolConfig)
+ from vllm.config import (LoRAConfig, ModelConfig, ParallelConfig,
+                          SchedulerConfig, TokenizerPoolConfig)
from vllm.executor.ray_utils import ray

from .base_tokenizer_group import AnyTokenizer, BaseTokenizerGroup
@@ -16,10 +16,11 @@
def init_tokenizer_from_configs(model_config: ModelConfig,
scheduler_config: SchedulerConfig,
parallel_config: ParallelConfig,
- enable_lora: bool):
+ lora_config: LoRAConfig):
init_kwargs = dict(tokenizer_id=model_config.tokenizer,
- enable_lora=enable_lora,
+ enable_lora=bool(lora_config),
max_num_seqs=scheduler_config.max_num_seqs,
+ max_loras=lora_config.max_loras if lora_config else 0,
max_input_length=None,
tokenizer_mode=model_config.tokenizer_mode,
trust_remote_code=model_config.trust_remote_code,
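With this change, init_tokenizer_from_configs receives the whole LoRAConfig and derives both enable_lora and max_loras itself, so engine-side callers only forward the config object. A standalone sketch of that derivation; FakeLoRAConfig is a hypothetical stand-in for vllm.config.LoRAConfig, not vLLM code:

# Standalone sketch of the kwargs derivation now done inside init_tokenizer_from_configs.
from dataclasses import dataclass
from typing import Optional

@dataclass
class FakeLoRAConfig:          # hypothetical stand-in for vllm.config.LoRAConfig
    max_loras: int = 1

def tokenizer_init_kwargs(lora_config: Optional[FakeLoRAConfig],
                          max_num_seqs: int) -> dict:
    return dict(
        enable_lora=bool(lora_config),  # LoRA support iff a LoRA config is present
        max_num_seqs=max_num_seqs,
        max_loras=lora_config.max_loras if lora_config else 0,
    )

print(tokenizer_init_kwargs(None, max_num_seqs=256))
# {'enable_lora': False, 'max_num_seqs': 256, 'max_loras': 0}
print(tokenizer_init_kwargs(FakeLoRAConfig(max_loras=8), max_num_seqs=256))
# {'enable_lora': True, 'max_num_seqs': 256, 'max_loras': 8}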
5 changes: 3 additions & 2 deletions vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
@@ -51,14 +51,15 @@ def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig],
return cls(**init_kwargs)

def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
- max_input_length: Optional[int], num_actors: int,
- ray_actor_options: dict, **tokenizer_config):
+ max_loras: int, max_input_length: Optional[int],
+ num_actors: int, ray_actor_options: dict, **tokenizer_config):
# Store a local copy of the TokenizerGroup for quick access
# to underlying HF tokenizers.
self._tokenizer_config = {
"tokenizer_id": tokenizer_id,
"enable_lora": enable_lora,
"max_num_seqs": max_num_seqs,
"max_loras": max_loras,
"max_input_length": max_input_length,
**tokenizer_config
}
5 changes: 3 additions & 2 deletions vllm/transformers_utils/tokenizer_group/tokenizer_group.py
@@ -15,14 +15,15 @@ class TokenizerGroup(BaseTokenizerGroup):
"""A group of tokenizers that can be used for LoRA adapters."""

def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
- max_input_length: Optional[int], **tokenizer_config):
+ max_loras: int, max_input_length: Optional[int],
+ **tokenizer_config):
self.tokenizer_id = tokenizer_id
self.tokenizer_config = tokenizer_config
self.enable_lora = enable_lora
self.max_input_length = max_input_length
self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config)
self.lora_tokenizers = LRUCache[AnyTokenizer](
- capacity=max_num_seqs if enable_lora else 0)
+ capacity=max(max_loras, max_num_seqs) if enable_lora else 0)

@classmethod
def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig],
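The capacity change above is the core of the commit: the lora_tokenizers LRU cache is now sized to at least max_loras, presumably so that a deployment allowing more concurrent adapters than max_num_seqs does not evict tokenizers for adapters that are still in use. A standalone sketch of the rule, with a plain OrderedDict standing in for vLLM's LRUCache:

# Standalone sketch of the new capacity rule; OrderedDict stands in for vLLM's LRUCache.
from collections import OrderedDict

def lora_tokenizer_capacity(enable_lora: bool, max_loras: int, max_num_seqs: int) -> int:
    # Before this commit the capacity was just max_num_seqs, so with
    # max_loras > max_num_seqs some adapters' tokenizers could be evicted early.
    return max(max_loras, max_num_seqs) if enable_lora else 0

capacity = lora_tokenizer_capacity(enable_lora=True, max_loras=16, max_num_seqs=8)
assert capacity == 16          # old rule would have given 8

cache = OrderedDict()
for lora_id in range(16):      # one (fake) tokenizer per adapter
    cache[lora_id] = f"tokenizer-for-lora-{lora_id}"
    if len(cache) > capacity:
        cache.popitem(last=False)   # evict the least-recently used entry
assert len(cache) == 16        # all 16 adapters stay resident under the new capacity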
2 changes: 1 addition & 1 deletion vllm/v1/engine/async_llm.py
@@ -51,7 +51,7 @@ def __init__(
model_config=vllm_config.model_config,
scheduler_config=vllm_config.scheduler_config,
parallel_config=vllm_config.parallel_config,
- enable_lora=bool(vllm_config.lora_config))
+ lora_config=vllm_config.lora_config)
self.tokenizer.ping()

# Request streams (map of request_id -> AsyncStream).
2 changes: 1 addition & 1 deletion vllm/v1/engine/llm_engine.py
@@ -46,7 +46,7 @@ def __init__(
model_config=vllm_config.model_config,
scheduler_config=vllm_config.scheduler_config,
parallel_config=vllm_config.parallel_config,
- enable_lora=bool(vllm_config.lora_config))
+ lora_config=vllm_config.lora_config)
self.tokenizer.ping()

# Processor (convert Inputs --> EngineCoreRequests)