From e215b7dd65ad149d9f1abfc716d56c4cc3ada5cf Mon Sep 17 00:00:00 2001
From: Ethan Zhang <26497102+ethnzhng@users.noreply.github.com>
Date: Tue, 12 Nov 2024 18:05:34 +0000
Subject: [PATCH 1/4] [neo] Increase gpu_memory_utilization of lmi-dist engine
 in sharding jobs

---
 serving/docker/partition/sm_neo_shard.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/serving/docker/partition/sm_neo_shard.py b/serving/docker/partition/sm_neo_shard.py
index d499022bf..b6ec02f23 100644
--- a/serving/docker/partition/sm_neo_shard.py
+++ b/serving/docker/partition/sm_neo_shard.py
@@ -103,6 +103,7 @@ def shard_lmi_dist_model(self, input_dir: str, output_dir: str,
             enforce_eager=True,
             disable_custom_all_reduce=True,
             distributed_executor_backend="mp",
+            gpu_memory_utilization=0.99,
         )
 
         engine = engine_from_args(engine_args)

From bca3148773de9df0de99441c86b30ca8efba6922 Mon Sep 17 00:00:00 2001
From: Ethan Zhang <26497102+ethnzhng@users.noreply.github.com>
Date: Tue, 12 Nov 2024 20:50:18 +0000
Subject: [PATCH 2/4] [neo] Support option.gpu_memory_utilization for sharding
 jobs

---
 serving/docker/partition/sm_neo_shard.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/serving/docker/partition/sm_neo_shard.py b/serving/docker/partition/sm_neo_shard.py
index b6ec02f23..4d3788b44 100644
--- a/serving/docker/partition/sm_neo_shard.py
+++ b/serving/docker/partition/sm_neo_shard.py
@@ -95,6 +95,9 @@ def copy_non_safetensors_files(self, input_dir: str, output_dir: str):
     def shard_lmi_dist_model(self, input_dir: str, output_dir: str,
                              pp_degree: int, tp_degree: int,
                              chunk_mb: int) -> None:
+        # Use default LMI value of 0.9 for gpu_memory_utilization, unless set otherwise
+        gpu_memory_utilization = float(
+            self.properties.get("option.gpu_memory_utilization", 0.9))
 
         engine_args = VllmEngineArgs(
             model=input_dir,
@@ -103,7 +106,7 @@ def shard_lmi_dist_model(self, input_dir: str, output_dir: str,
             enforce_eager=True,
             disable_custom_all_reduce=True,
             distributed_executor_backend="mp",
-            gpu_memory_utilization=0.99,
+            gpu_memory_utilization=gpu_memory_utilization,
         )
 
         engine = engine_from_args(engine_args)

From 0d1b11d49f8b26c4ec94ab678941633caaf05361 Mon Sep 17 00:00:00 2001
From: Ethan Zhang <26497102+ethnzhng@users.noreply.github.com>
Date: Tue, 12 Nov 2024 21:37:25 +0000
Subject: [PATCH 3/4] [neo] Support option.enforce_eager for sharding jobs

---
 serving/docker/partition/sm_neo_shard.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/serving/docker/partition/sm_neo_shard.py b/serving/docker/partition/sm_neo_shard.py
index 4d3788b44..c4f48ca60 100644
--- a/serving/docker/partition/sm_neo_shard.py
+++ b/serving/docker/partition/sm_neo_shard.py
@@ -98,15 +98,19 @@ def shard_lmi_dist_model(self, input_dir: str, output_dir: str,
         # Use default LMI value of 0.9 for gpu_memory_utilization, unless set otherwise
         gpu_memory_utilization = float(
             self.properties.get("option.gpu_memory_utilization", 0.9))
+        # Use default LMI value of False for enforce_eager, unless set otherwise
+        enforce_eager: bool = str(
+            self.properties.get("option.enforce_eager",
+                                False)).lower() == "true"
 
         engine_args = VllmEngineArgs(
             model=input_dir,
             pipeline_parallel_size=pp_degree,
             tensor_parallel_size=tp_degree,
-            enforce_eager=True,
             disable_custom_all_reduce=True,
             distributed_executor_backend="mp",
             gpu_memory_utilization=gpu_memory_utilization,
+            enforce_eager=enforce_eager,
         )
 
         engine = engine_from_args(engine_args)
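Note: patches 2 and 3 both read string-typed values out of self.properties and coerce them to the types VllmEngineArgs expects. A minimal standalone sketch of that parsing, assuming `properties` is a plain dict of strings as loaded from a serving.properties file (the dict contents here are illustrative, not from the patch):

    # Sketch of the float/bool property parsing from patches 2-3.
    # Assumes property values arrive as strings from serving.properties.
    properties = {
        "option.gpu_memory_utilization": "0.95",
        "option.enforce_eager": "TRUE",
    }

    # Float property: fall back to the LMI default of 0.9 when unset.
    gpu_memory_utilization = float(
        properties.get("option.gpu_memory_utilization", 0.9))

    # Bool property: bool("false") is truthy in Python, so the code
    # lower-cases the string and compares it against "true" instead.
    enforce_eager = str(properties.get("option.enforce_eager",
                                       False)).lower() == "true"

    assert gpu_memory_utilization == 0.95
    assert enforce_eager is True

The str(...) wrapper lets the same expression handle both the False default (a bool) and a customer-supplied string.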
From 1776eb6cbe255b18e18d31c0e6c1579f35c65564 Mon Sep 17 00:00:00 2001
From: Ethan Zhang <26497102+ethnzhng@users.noreply.github.com>
Date: Tue, 12 Nov 2024 22:13:43 +0000
Subject: [PATCH 4/4] Add max_num_seqs & max_model_len

---
 serving/docker/partition/sm_neo_shard.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/serving/docker/partition/sm_neo_shard.py b/serving/docker/partition/sm_neo_shard.py
index c4f48ca60..dd12b8b80 100644
--- a/serving/docker/partition/sm_neo_shard.py
+++ b/serving/docker/partition/sm_neo_shard.py
@@ -95,13 +95,16 @@ def copy_non_safetensors_files(self, input_dir: str, output_dir: str):
     def shard_lmi_dist_model(self, input_dir: str, output_dir: str,
                              pp_degree: int, tp_degree: int,
                              chunk_mb: int) -> None:
-        # Use default LMI value of 0.9 for gpu_memory_utilization, unless set otherwise
+        # For engine args which can affect GPU memory utilization, use LMI defaults
+        # unless specified otherwise by the customer
         gpu_memory_utilization = float(
             self.properties.get("option.gpu_memory_utilization", 0.9))
-        # Use default LMI value of False for enforce_eager, unless set otherwise
         enforce_eager: bool = str(
             self.properties.get("option.enforce_eager",
                                 False)).lower() == "true"
+        max_rolling_batch_size = int(
+            self.properties.get("option.max_rolling_batch_size", 256))
+        max_model_len = int(self.properties.get("option.max_model_len", None))
 
         engine_args = VllmEngineArgs(
             model=input_dir,
@@ -111,6 +114,8 @@ def shard_lmi_dist_model(self, input_dir: str, output_dir: str,
             distributed_executor_backend="mp",
             gpu_memory_utilization=gpu_memory_utilization,
             enforce_eager=enforce_eager,
+            max_num_seqs=max_rolling_batch_size,
+            max_model_len=max_model_len,
        )
 
         engine = engine_from_args(engine_args)
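One caveat in patch 4: int(self.properties.get("option.max_model_len", None)) raises a TypeError whenever the property is unset, because int(None) is not a valid conversion. Below is a standalone sketch of the final parsing logic with that conversion guarded; the helper name `parse_shard_engine_kwargs` and the plain-dict `properties` argument are illustrative assumptions, not part of the patch:

    from typing import Optional

    def parse_shard_engine_kwargs(properties: dict) -> dict:
        # Same defaults as the patch series: LMI values unless overridden.
        gpu_memory_utilization = float(
            properties.get("option.gpu_memory_utilization", 0.9))
        enforce_eager = str(properties.get("option.enforce_eager",
                                           False)).lower() == "true"
        max_num_seqs = int(
            properties.get("option.max_rolling_batch_size", 256))
        # Guarded conversion: leave max_model_len as None when unset so
        # vLLM derives it from the model config, rather than calling int(None).
        raw_len = properties.get("option.max_model_len")
        max_model_len: Optional[int] = int(raw_len) if raw_len is not None else None
        return {
            "gpu_memory_utilization": gpu_memory_utilization,
            "enforce_eager": enforce_eager,
            "max_num_seqs": max_num_seqs,
            "max_model_len": max_model_len,
        }

    # With no overrides set, only the defaults apply and max_model_len stays None.
    print(parse_shard_engine_kwargs({}))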