From bfcfbbab62b4d6795611eb8cdb9a1e662388403b Mon Sep 17 00:00:00 2001
From: Yunfeng Bai <83252681+yunfeng-scale@users.noreply.github.com>
Date: Fri, 8 Mar 2024 13:05:50 -0800
Subject: [PATCH] Change back batch infer GPU util and add tool completion
 client changes (#465)

* Change back batch infer gpu util

* Add client changes

* fixes

* bump
---
 clients/python/llmengine/__init__.py          |  2 +-
 clients/python/llmengine/completion.py        | 33 +++++++++++++++++++
 clients/python/llmengine/data_types.py        | 30 +++++++++++++++++
 clients/python/pyproject.toml                 |  2 +-
 clients/python/setup.py                       |  2 +-
 docs/api/data_types.md                        |  1 +
 docs/guides/completions.md                    | 26 ++++++++++++++-
 .../inference/batch_inference/vllm_batch.py   |  3 +-
 8 files changed, 94 insertions(+), 5 deletions(-)

diff --git a/clients/python/llmengine/__init__.py b/clients/python/llmengine/__init__.py
index cc19aefd4..998388ac9 100644
--- a/clients/python/llmengine/__init__.py
+++ b/clients/python/llmengine/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-__version__ = "0.0.0b25"
+__version__ = "0.0.0b26"

 import os
 from typing import Sequence
diff --git a/clients/python/llmengine/completion.py b/clients/python/llmengine/completion.py
index 3a02f04e6..43d0813c7 100644
--- a/clients/python/llmengine/completion.py
+++ b/clients/python/llmengine/completion.py
@@ -10,6 +10,7 @@
     CreateBatchCompletionsRequest,
     CreateBatchCompletionsRequestContent,
     CreateBatchCompletionsResponse,
+    ToolConfig,
 )

 COMPLETION_TIMEOUT = 300
@@ -412,6 +413,7 @@ def batch_create(
         input_data_path: Optional[str] = None,
         data_parallelism: int = 1,
         max_runtime_sec: int = 24 * 3600,
+        tool_config: Optional[ToolConfig] = None,
     ) -> CreateBatchCompletionsResponse:
         """
         Creates a batch completion for the provided input data. The job runs offline and does not depend on an existing model endpoint.
@@ -437,6 +439,13 @@ def batch_create(
             max_runtime_sec (int):
                 The maximum runtime of the batch completion in seconds. Defaults to 24 hours.

+            tool_config (Optional[ToolConfig]):
+                Configuration for tool use.
+                NOTE: this config is highly experimental and its signature will change significantly in future iterations.
+                Currently only the Python code evaluator is supported.
+                A Python code block starts with "\`\`\`python\\n" and ends with "\\n>>>\\n"; the text between "\\n\`\`\`\\n" and the end of the block is replaced with the Python execution results.
+                Please format prompts accordingly and provide examples so that LLMs can generate Python code properly.
+
         Returns:
             response (CreateBatchCompletionsResponse): The response containing the job id.

@@ -480,6 +489,29 @@ def batch_create(
             )
             print(response.json())
             ```
+
+        === "Batch completions with prompts and tool use"
+            ```python
+            from llmengine import Completion
+            from llmengine.data_types import CreateBatchCompletionsModelConfig, CreateBatchCompletionsRequestContent, ToolConfig
+
+            # Store CreateBatchCompletionsRequestContent data into input file "s3://my-input-path"
+
+            response = Completion.batch_create(
+                input_data_path="s3://my-input-path",
+                output_data_path="s3://my-output-path",
+                model_config=CreateBatchCompletionsModelConfig(
+                    model="llama-2-7b",
+                    checkpoint_path="s3://checkpoint-path",
+                    labels={"team":"my-team", "product":"my-product"}
+                ),
+                data_parallelism=2,
+                tool_config=ToolConfig(
+                    name="code_evaluator",
+                )
+            )
+            print(response.json())
+            ```
         """
         data = CreateBatchCompletionsRequest(
             model_config=model_config,
@@ -488,6 +520,7 @@ def batch_create(
             output_data_path=output_data_path,
             data_parallelism=data_parallelism,
             max_runtime_sec=max_runtime_sec,
+            tool_config=tool_config,
         ).dict()
         response = cls.post_sync(
             resource_name="v1/llm/batch-completions",
diff --git a/clients/python/llmengine/data_types.py b/clients/python/llmengine/data_types.py
index 209084aa8..06c0b805f 100644
--- a/clients/python/llmengine/data_types.py
+++ b/clients/python/llmengine/data_types.py
@@ -1,6 +1,7 @@
 """
 DTOs for LLM APIs.
 """
+
 import datetime
 from enum import Enum
 from typing import Any, Dict, List, Literal, Optional, Union
@@ -658,6 +659,30 @@ class CreateBatchCompletionsModelConfig(BaseModel):
     """


+class ToolConfig(BaseModel):
+    """
+    Configuration for tool use.
+    NOTE: this config is highly experimental and its signature will change significantly in future iterations.
+    """
+
+    name: str
+    """
+    Name of the tool to use for the batch inference.
+    """
+    max_iterations: Optional[int] = 10
+    """
+    Maximum number of iterations to run the tool.
+    """
+    execution_timeout_seconds: Optional[int] = 60
+    """
+    Maximum runtime of the tool in seconds.
+    """
+    should_retry_on_error: Optional[bool] = True
+    """
+    Whether to retry the tool on error.
+    """
+
+
 class CreateBatchCompletionsRequest(BaseModel):
     """
     Request object for batch completions.
@@ -685,6 +710,11 @@ class CreateBatchCompletionsRequest(BaseModel):
     """
     Maximum runtime of the batch inference in seconds. Default to one day.
     """
+    tool_config: Optional[ToolConfig] = None
+    """
+    Configuration for tool use.
+    NOTE: this config is highly experimental and its signature will change significantly in future iterations.
+    """


 class CreateBatchCompletionsResponse(BaseModel):
diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml
index a0afe290c..a2fdc9ce0 100644
--- a/clients/python/pyproject.toml
+++ b/clients/python/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "scale-llm-engine"
-version = "0.0.0.beta25"
+version = "0.0.0.beta26"
 description = "Scale LLM Engine Python client"
 license = "Apache-2.0"
 authors = ["Phil Chen "]
diff --git a/clients/python/setup.py b/clients/python/setup.py
index 961459dc0..9afe61362 100644
--- a/clients/python/setup.py
+++ b/clients/python/setup.py
@@ -3,6 +3,6 @@
 setup(
     name="scale-llm-engine",
     python_requires=">=3.7",
-    version="0.0.0.beta25",
+    version="0.0.0.beta26",
     packages=find_packages(),
 )
diff --git a/docs/api/data_types.md b/docs/api/data_types.md
index 206c93e6d..0576329c0 100644
--- a/docs/api/data_types.md
+++ b/docs/api/data_types.md
@@ -143,6 +143,7 @@
             - model_config
             - data_parallelism
             - max_runtime_sec
+            - tool_config

 ::: llmengine.CreateBatchCompletionsResponse
     options:
diff --git a/docs/guides/completions.md b/docs/guides/completions.md
index f48f05c4a..69dfe1bd5 100644
--- a/docs/guides/completions.md
+++ b/docs/guides/completions.md
@@ -122,7 +122,7 @@ asyncio.run(main())

 ## Batch completions

-The Python client also supports batch completions. Batch completions supports distributing data to multiple workers to accelerate inference. It also tries to maximize throughput so the completions should finish quite a bit faster than hitting models through HTTP. Use [Completion.batch_complete](../../api/python_client/#llmengine.completion.Completion.batch_complete) to utilize batch completions.
+The Python client also supports batch completions. Batch completions support distributing data to multiple workers to accelerate inference. They also try to maximize throughput, so the completions should finish quite a bit faster than hitting models through HTTP. Use [Completion.batch_create](../../api/python_client/#llmengine.Completion.batch_create) to utilize batch completions.

 Some examples of batch completions:

@@ -169,6 +169,30 @@
 print(response.job_id)
 ```

+=== "Batch completions with prompts and tool use"
+For details on how to use the tool properly, please see the [Completion.batch_create](../../api/python_client/#llmengine.Completion.batch_create) tool_config documentation.
+```python
+from llmengine import Completion
+from llmengine.data_types import CreateBatchCompletionsModelConfig, CreateBatchCompletionsRequestContent, ToolConfig
+
+# Store CreateBatchCompletionsRequestContent data into input file "s3://my-input-path"
+
+response = Completion.batch_create(
+    input_data_path="s3://my-input-path",
+    output_data_path="s3://my-output-path",
+    model_config=CreateBatchCompletionsModelConfig(
+        model="llama-2-7b",
+        checkpoint_path="s3://checkpoint-path",
+        labels={"team":"my-team", "product":"my-product"}
+    ),
+    data_parallelism=2,
+    tool_config=ToolConfig(
+        name="code_evaluator",
+    )
+)
+print(response.json())
+```
+
 ## Which model should I use?

 See the [Model Zoo](../../model_zoo) for more information on best practices for which model to use for Completions.
diff --git a/model-engine/model_engine_server/inference/batch_inference/vllm_batch.py b/model-engine/model_engine_server/inference/batch_inference/vllm_batch.py
index d783b3f44..dced4e840 100644
--- a/model-engine/model_engine_server/inference/batch_inference/vllm_batch.py
+++ b/model-engine/model_engine_server/inference/batch_inference/vllm_batch.py
@@ -132,7 +132,7 @@ def get_vllm_engine(model, request):
         tensor_parallel_size=request.model_config.num_shards,
         seed=request.model_config.seed or 0,
         disable_log_requests=True,
-        gpu_memory_utilization=0.8,  # To avoid OOM errors when there's host machine GPU usage
+        gpu_memory_utilization=0.9,
     )

     llm = AsyncLLMEngine.from_engine_args(engine_args)
@@ -432,6 +432,7 @@ def check_unknown_startup_memory_usage():  # pragma: no cover
     """Check for unknown memory usage at startup."""
     gpu_free_memory = get_gpu_free_memory()
     if gpu_free_memory is not None:
+        print(f"GPU free memory at startup in MB: {gpu_free_memory}")
        min_mem = min(gpu_free_memory)
        max_mem = max(gpu_free_memory)
        if max_mem - min_mem > 10:
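
As a rough sketch of the code-evaluator convention described in the `tool_config` docstring above: a generation is expected to contain a Python block opened with "```python\n" and terminated by "\n>>>\n", and the text between the closing "\n```\n" fence and the ">>>" marker is overwritten with the execution output. The snippet below only illustrates that substitution; the `substitute_tool_output` helper and the fake generation are hypothetical and are not part of this patch or of the llmengine client.

```python
# Sketch of the prompt/response convention from the ToolConfig docstring above.
# The marker strings come from that docstring; everything else here
# (the helper name and the fake generation) is hypothetical.
import contextlib
import io

CODE_START = "```python\n"
CODE_END = "\n```\n"
BLOCK_END = "\n>>>\n"


def substitute_tool_output(generation: str) -> str:
    """Run the Python block in `generation` and splice its stdout between the
    closing code fence and the ">>>" marker, mimicking the documented behavior."""
    start = generation.find(CODE_START)
    if start == -1:
        return generation  # no tool call detected; leave the text untouched
    code_end = generation.find(CODE_END, start)
    block_end = generation.find(BLOCK_END, start)
    if code_end == -1 or block_end == -1:
        return generation  # malformed block; leave the text untouched

    code = generation[start + len(CODE_START) : code_end]

    stdout = io.StringIO()
    with contextlib.redirect_stdout(stdout):
        exec(code, {})  # the real tool presumably sandboxes and time-limits this

    # Everything the model wrote between the closing fence and ">>>" is
    # replaced with the actual execution results.
    return (
        generation[: code_end + len(CODE_END)]
        + stdout.getvalue()
        + generation[block_end:]
    )


if __name__ == "__main__":
    fake_generation = (
        "Let me compute that.\n"
        "```python\nprint(17 * 23)\n```\n"
        "391\n>>>\nSo the answer is 391."
    )
    print(substitute_tool_output(fake_generation))
```

In the actual batch inference worker, iteration count, timeouts, and retries for this loop are governed by the `ToolConfig.max_iterations`, `execution_timeout_seconds`, and `should_retry_on_error` fields added in this patch.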