From e215b7dd65ad149d9f1abfc716d56c4cc3ada5cf Mon Sep 17 00:00:00 2001
From: Ethan Zhang <26497102+ethnzhng@users.noreply.github.com>
Date: Tue, 12 Nov 2024 18:05:34 +0000
Subject: [PATCH 1/4] [neo] Increase gpu_memory_utilization of lmi-dist engine
 in sharding jobs

---
 serving/docker/partition/sm_neo_shard.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/serving/docker/partition/sm_neo_shard.py b/serving/docker/partition/sm_neo_shard.py
index d499022bf..b6ec02f23 100644
--- a/serving/docker/partition/sm_neo_shard.py
+++ b/serving/docker/partition/sm_neo_shard.py
@@ -103,6 +103,7 @@ def shard_lmi_dist_model(self, input_dir: str, output_dir: str,
             enforce_eager=True,
             disable_custom_all_reduce=True,
             distributed_executor_backend="mp",
+            gpu_memory_utilization=0.99,
         )
 
         engine = engine_from_args(engine_args)

From bca3148773de9df0de99441c86b30ca8efba6922 Mon Sep 17 00:00:00 2001
From: Ethan Zhang <26497102+ethnzhng@users.noreply.github.com>
Date: Tue, 12 Nov 2024 20:50:18 +0000
Subject: [PATCH 2/4] [neo] Support option.gpu_memory_utilization for sharding
 jobs

---
 serving/docker/partition/sm_neo_shard.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/serving/docker/partition/sm_neo_shard.py b/serving/docker/partition/sm_neo_shard.py
index b6ec02f23..4d3788b44 100644
--- a/serving/docker/partition/sm_neo_shard.py
+++ b/serving/docker/partition/sm_neo_shard.py
@@ -95,6 +95,9 @@ def copy_non_safetensors_files(self, input_dir: str, output_dir: str):
     def shard_lmi_dist_model(self, input_dir: str, output_dir: str,
                              pp_degree: int, tp_degree: int,
                              chunk_mb: int) -> None:
+        # Use default LMI value of 0.9 for gpu_memory_utilization, unless set otherwise
+        gpu_memory_utilization = float(
+            self.properties.get("option.gpu_memory_utilization", 0.9))
 
         engine_args = VllmEngineArgs(
             model=input_dir,
@@ -103,7 +106,7 @@ def shard_lmi_dist_model(self, input_dir: str, output_dir: str,
             enforce_eager=True,
             disable_custom_all_reduce=True,
             distributed_executor_backend="mp",
-            gpu_memory_utilization=0.99,
+            gpu_memory_utilization=gpu_memory_utilization,
         )
 
         engine = engine_from_args(engine_args)

From 0d1b11d49f8b26c4ec94ab678941633caaf05361 Mon Sep 17 00:00:00 2001
From: Ethan Zhang <26497102+ethnzhng@users.noreply.github.com>
Date: Tue, 12 Nov 2024 21:37:25 +0000
Subject: [PATCH 3/4] [neo] Support option.enforce_eager for sharding jobs

---
 serving/docker/partition/sm_neo_shard.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/serving/docker/partition/sm_neo_shard.py b/serving/docker/partition/sm_neo_shard.py
index 4d3788b44..c4f48ca60 100644
--- a/serving/docker/partition/sm_neo_shard.py
+++ b/serving/docker/partition/sm_neo_shard.py
@@ -98,15 +98,19 @@ def shard_lmi_dist_model(self, input_dir: str, output_dir: str,
         # Use default LMI value of 0.9 for gpu_memory_utilization, unless set otherwise
         gpu_memory_utilization = float(
             self.properties.get("option.gpu_memory_utilization", 0.9))
+        # Use default LMI value of False for enforce_eager, unless set otherwise
+        enforce_eager: bool = str(
+            self.properties.get("option.enforce_eager",
+                                False)).lower() == "true"
 
         engine_args = VllmEngineArgs(
             model=input_dir,
             pipeline_parallel_size=pp_degree,
             tensor_parallel_size=tp_degree,
-            enforce_eager=True,
             disable_custom_all_reduce=True,
             distributed_executor_backend="mp",
             gpu_memory_utilization=gpu_memory_utilization,
+            enforce_eager=enforce_eager,
         )
 
         engine = engine_from_args(engine_args)
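Note: patches 2 and 3 both read string-typed values out of self.properties and coerce them to the types VllmEngineArgs expects. A minimal standalone sketch of that parsing, assuming `properties` is a plain dict of strings as loaded from a serving.properties file (the dict contents here are illustrative, not from the patch):

    # Sketch of the float/bool property parsing from patches 2-3.
    # Assumes property values arrive as strings from serving.properties.
    properties = {
        "option.gpu_memory_utilization": "0.95",
        "option.enforce_eager": "TRUE",
    }

    # Float property: fall back to the LMI default of 0.9 when unset.
    gpu_memory_utilization = float(
        properties.get("option.gpu_memory_utilization", 0.9))

    # Bool property: bool("false") is truthy in Python, so the code
    # lower-cases the string and compares it against "true" instead.
    enforce_eager = str(properties.get("option.enforce_eager",
                                       False)).lower() == "true"

    assert gpu_memory_utilization == 0.95
    assert enforce_eager is True

The str(...) wrapper lets the same expression handle both the False default (a bool) and a customer-supplied string.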
From 1776eb6cbe255b18e18d31c0e6c1579f35c65564 Mon Sep 17 00:00:00 2001
From: Ethan Zhang <26497102+ethnzhng@users.noreply.github.com>
Date: Tue, 12 Nov 2024 22:13:43 +0000
Subject: [PATCH 4/4] Add max_num_seqs & max_model_len

---
 serving/docker/partition/sm_neo_shard.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/serving/docker/partition/sm_neo_shard.py b/serving/docker/partition/sm_neo_shard.py
index c4f48ca60..dd12b8b80 100644
--- a/serving/docker/partition/sm_neo_shard.py
+++ b/serving/docker/partition/sm_neo_shard.py
@@ -95,13 +95,16 @@ def copy_non_safetensors_files(self, input_dir: str, output_dir: str):
     def shard_lmi_dist_model(self, input_dir: str, output_dir: str,
                              pp_degree: int, tp_degree: int,
                              chunk_mb: int) -> None:
-        # Use default LMI value of 0.9 for gpu_memory_utilization, unless set otherwise
+        # For engine args which can affect GPU memory utilization, use LMI defaults
+        # unless specified otherwise by the customer
         gpu_memory_utilization = float(
             self.properties.get("option.gpu_memory_utilization", 0.9))
-        # Use default LMI value of False for enforce_eager, unless set otherwise
         enforce_eager: bool = str(
             self.properties.get("option.enforce_eager",
                                 False)).lower() == "true"
+        max_rolling_batch_size = int(
+            self.properties.get("option.max_rolling_batch_size", 256))
+        max_model_len = int(self.properties.get("option.max_model_len", None))
 
         engine_args = VllmEngineArgs(
             model=input_dir,
@@ -111,6 +114,8 @@ def shard_lmi_dist_model(self, input_dir: str, output_dir: str,
             distributed_executor_backend="mp",
             gpu_memory_utilization=gpu_memory_utilization,
             enforce_eager=enforce_eager,
+            max_num_seqs=max_rolling_batch_size,
+            max_model_len=max_model_len,
        )
 
         engine = engine_from_args(engine_args)
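One caveat in patch 4: int(self.properties.get("option.max_model_len", None)) raises a TypeError whenever the property is unset, because int(None) is not a valid conversion. Below is a standalone sketch of the final parsing logic with that conversion guarded; the helper name `parse_shard_engine_kwargs` and the plain-dict `properties` argument are illustrative assumptions, not part of the patch:

    from typing import Optional

    def parse_shard_engine_kwargs(properties: dict) -> dict:
        # Same defaults as the patch series: LMI values unless overridden.
        gpu_memory_utilization = float(
            properties.get("option.gpu_memory_utilization", 0.9))
        enforce_eager = str(properties.get("option.enforce_eager",
                                           False)).lower() == "true"
        max_num_seqs = int(
            properties.get("option.max_rolling_batch_size", 256))
        # Guarded conversion: leave max_model_len as None when unset so
        # vLLM derives it from the model config, rather than calling int(None).
        raw_len = properties.get("option.max_model_len")
        max_model_len: Optional[int] = int(raw_len) if raw_len is not None else None
        return {
            "gpu_memory_utilization": gpu_memory_utilization,
            "enforce_eager": enforce_eager,
            "max_num_seqs": max_num_seqs,
            "max_model_len": max_model_len,
        }

    # With no overrides set, only the defaults apply and max_model_len stays None.
    print(parse_shard_engine_kwargs({}))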