diff --git a/clients/python/llmengine/__init__.py b/clients/python/llmengine/__init__.py index cc19aefd4..998388ac9 100644 --- a/clients/python/llmengine/__init__.py +++ b/clients/python/llmengine/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.0.0b25" +__version__ = "0.0.0b26" import os from typing import Sequence diff --git a/clients/python/llmengine/completion.py b/clients/python/llmengine/completion.py index 3a02f04e6..43d0813c7 100644 --- a/clients/python/llmengine/completion.py +++ b/clients/python/llmengine/completion.py @@ -10,6 +10,7 @@ CreateBatchCompletionsRequest, CreateBatchCompletionsRequestContent, CreateBatchCompletionsResponse, + ToolConfig, ) COMPLETION_TIMEOUT = 300 @@ -412,6 +413,7 @@ def batch_create( input_data_path: Optional[str] = None, data_parallelism: int = 1, max_runtime_sec: int = 24 * 3600, + tool_config: Optional[ToolConfig] = None, ) -> CreateBatchCompletionsResponse: """ Creates a batch completion for the provided input data. The job runs offline and does not depend on an existing model endpoint. @@ -437,6 +439,13 @@ def batch_create( max_runtime_sec (int): The maximum runtime of the batch completion in seconds. Defaults to 24 hours. + tool_config (Optional[ToolConfig]): + Configuration for tool use. + NOTE: this config is highly experimental and its signature will change significantly in future iterations. + Currently only the Python code evaluator is supported. + Python code context starts with "\`\`\`python\\n" and ends with "\\n>>>\\n"; the data before "\\n\`\`\`\\n" or the end of the content will be replaced by the Python execution results. + Please format prompts accordingly and provide examples so that LLMs can properly generate Python code. + Returns: response (CreateBatchCompletionsResponse): The response containing the job id. 
@@ -480,6 +489,29 @@ def batch_create( ) print(response.json()) ``` + + === "Batch completions with prompts and tool use" + ```python + from llmengine import Completion + from llmengine.data_types import CreateBatchCompletionsModelConfig, CreateBatchCompletionsRequestContent, ToolConfig + + # Store CreateBatchCompletionsRequestContent data into input file "s3://my-input-path" + + response = Completion.batch_create( + input_data_path="s3://my-input-path", + output_data_path="s3://my-output-path", + model_config=CreateBatchCompletionsModelConfig( + model="llama-2-7b", + checkpoint_path="s3://checkpoint-path", + labels={"team":"my-team", "product":"my-product"} + ), + data_parallelism=2, + tool_config=ToolConfig( + name="code_evaluator", + ) + ) + print(response.json()) + ``` """ data = CreateBatchCompletionsRequest( model_config=model_config, @@ -488,6 +520,7 @@ def batch_create( output_data_path=output_data_path, data_parallelism=data_parallelism, max_runtime_sec=max_runtime_sec, + tool_config=tool_config, ).dict() response = cls.post_sync( resource_name="v1/llm/batch-completions", diff --git a/clients/python/llmengine/data_types.py b/clients/python/llmengine/data_types.py index 209084aa8..06c0b805f 100644 --- a/clients/python/llmengine/data_types.py +++ b/clients/python/llmengine/data_types.py @@ -1,6 +1,7 @@ """ DTOs for LLM APIs. """ + import datetime from enum import Enum from typing import Any, Dict, List, Literal, Optional, Union @@ -658,6 +659,30 @@ class CreateBatchCompletionsModelConfig(BaseModel): """ +class ToolConfig(BaseModel): + """ + Configuration for tool use. + NOTE: this config is highly experimental and its signature will change significantly in future iterations. + """ + + name: str + """ + Name of the tool to use for the batch inference. + """ + max_iterations: Optional[int] = 10 + """ + Maximum number of iterations to run the tool. + """ + execution_timeout_seconds: Optional[int] = 60 + """ + Maximum runtime of the tool in seconds. 
+ """ + should_retry_on_error: Optional[bool] = True + """ + Whether to retry the tool on error. + """ + + class CreateBatchCompletionsRequest(BaseModel): """ Request object for batch completions. @@ -685,6 +710,11 @@ class CreateBatchCompletionsRequest(BaseModel): """ Maximum runtime of the batch inference in seconds. Default to one day. """ + tool_config: Optional[ToolConfig] = None + """ + Configuration for tool use. + NOTE: this config is highly experimental and signature will change significantly in future iterations. + """ class CreateBatchCompletionsResponse(BaseModel): diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml index a0afe290c..a2fdc9ce0 100644 --- a/clients/python/pyproject.toml +++ b/clients/python/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "scale-llm-engine" -version = "0.0.0.beta25" +version = "0.0.0.beta26" description = "Scale LLM Engine Python client" license = "Apache-2.0" authors = ["Phil Chen "] diff --git a/clients/python/setup.py b/clients/python/setup.py index 961459dc0..9afe61362 100644 --- a/clients/python/setup.py +++ b/clients/python/setup.py @@ -3,6 +3,6 @@ setup( name="scale-llm-engine", python_requires=">=3.7", - version="0.0.0.beta25", + version="0.0.0.beta26", packages=find_packages(), ) diff --git a/docs/api/data_types.md b/docs/api/data_types.md index 206c93e6d..0576329c0 100644 --- a/docs/api/data_types.md +++ b/docs/api/data_types.md @@ -143,6 +143,7 @@ - model_config - data_parallelism - max_runtime_sec + - tool_config ::: llmengine.CreateBatchCompletionsResponse options: diff --git a/docs/guides/completions.md b/docs/guides/completions.md index f48f05c4a..69dfe1bd5 100644 --- a/docs/guides/completions.md +++ b/docs/guides/completions.md @@ -122,7 +122,7 @@ asyncio.run(main()) ## Batch completions -The Python client also supports batch completions. Batch completions supports distributing data to multiple workers to accelerate inference. 
It also tries to maximize throughput so the completions should finish quite a bit faster than hitting models through HTTP. Use [Completion.batch_complete](../../api/python_client/#llmengine.completion.Completion.batch_complete) to utilize batch completions. +The Python client also supports batch completions. Batch completions support distributing data to multiple workers to accelerate inference. They also try to maximize throughput, so the completions should finish quite a bit faster than hitting models through HTTP. Use [Completion.batch_create](../../api/python_client/#llmengine.Completion.batch_create) to run batch completions. Some examples of batch completions: @@ -169,6 +169,30 @@ response = Completion.batch_create( print(response.job_id) ``` +=== "Batch completions with prompts and tool use" +For details on how to use the tool properly, see the `tool_config` documentation in [Completion.batch_create](../../api/python_client/#llmengine.Completion.batch_create). +```python +from llmengine import Completion +from llmengine.data_types import CreateBatchCompletionsModelConfig, CreateBatchCompletionsRequestContent, ToolConfig + +# Store CreateBatchCompletionsRequestContent data into input file "s3://my-input-path" + +response = Completion.batch_create( + input_data_path="s3://my-input-path", + output_data_path="s3://my-output-path", + model_config=CreateBatchCompletionsModelConfig( + model="llama-2-7b", + checkpoint_path="s3://checkpoint-path", + labels={"team":"my-team", "product":"my-product"} + ), + data_parallelism=2, + tool_config=ToolConfig( + name="code_evaluator", + ) +) +print(response.json()) +``` + ## Which model should I use? See the [Model Zoo](../../model_zoo) for more information on best practices for which model to use for Completions. 
diff --git a/model-engine/model_engine_server/inference/batch_inference/vllm_batch.py b/model-engine/model_engine_server/inference/batch_inference/vllm_batch.py index d783b3f44..dced4e840 100644 --- a/model-engine/model_engine_server/inference/batch_inference/vllm_batch.py +++ b/model-engine/model_engine_server/inference/batch_inference/vllm_batch.py @@ -132,7 +132,7 @@ def get_vllm_engine(model, request): tensor_parallel_size=request.model_config.num_shards, seed=request.model_config.seed or 0, disable_log_requests=True, - gpu_memory_utilization=0.8, # To avoid OOM errors when there's host machine GPU usage + gpu_memory_utilization=0.9, ) llm = AsyncLLMEngine.from_engine_args(engine_args) @@ -432,6 +432,7 @@ def check_unknown_startup_memory_usage(): # pragma: no cover """Check for unknown memory usage at startup.""" gpu_free_memory = get_gpu_free_memory() if gpu_free_memory is not None: + print(f"GPU free memory at startup in MB: {gpu_free_memory}") min_mem = min(gpu_free_memory) max_mem = max(gpu_free_memory) if max_mem - min_mem > 10: