Change back batch infer GPU util and add tool completion client changes (#465)

* Change back batch infer gpu util

* Add client changes

* fixes

* bump
yunfeng-scale authored Mar 8, 2024
1 parent 659d08d commit bfcfbba
Showing 8 changed files with 94 additions and 5 deletions.
2 changes: 1 addition & 1 deletion clients/python/llmengine/__init__.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "0.0.0b25"
__version__ = "0.0.0b26"

import os
from typing import Sequence
33 changes: 33 additions & 0 deletions clients/python/llmengine/completion.py
@@ -10,6 +10,7 @@
CreateBatchCompletionsRequest,
CreateBatchCompletionsRequestContent,
CreateBatchCompletionsResponse,
ToolConfig,
)

COMPLETION_TIMEOUT = 300
@@ -412,6 +413,7 @@ def batch_create(
input_data_path: Optional[str] = None,
data_parallelism: int = 1,
max_runtime_sec: int = 24 * 3600,
tool_config: Optional[ToolConfig] = None,
) -> CreateBatchCompletionsResponse:
"""
Creates a batch completion for the provided input data. The job runs offline and does not depend on an existing model endpoint.
@@ -437,6 +439,13 @@
max_runtime_sec (int):
The maximum runtime of the batch completion in seconds. Defaults to 24 hours.
tool_config (Optional[ToolConfig]):
Configuration for tool use.
NOTE: this config is highly experimental and its signature will change significantly in future iterations.
Currently only the Python code evaluator is supported.
A Python code context starts with "\`\`\`python\\n" and ends with "\\n>>>\\n"; the data between "\\n\`\`\`\\n" and the end of the context is replaced by the Python execution results.
Please format prompts accordingly and provide examples so LLMs can properly generate Python code.
Returns:
response (CreateBatchCompletionsResponse): The response containing the job id.
@@ -480,6 +489,29 @@ def batch_create(
)
print(response.json())
```
=== "Batch completions with prompts and use tool"
```python
from llmengine import Completion
from llmengine.data_types import CreateBatchCompletionsModelConfig, CreateBatchCompletionsRequestContent, ToolConfig
# Store CreateBatchCompletionsRequestContent data into input file "s3://my-input-path"
response = Completion.batch_create(
input_data_path="s3://my-input-path",
output_data_path="s3://my-output-path",
model_config=CreateBatchCompletionsModelConfig(
model="llama-2-7b",
checkpoint_path="s3://checkpoint-path",
labels={"team":"my-team", "product":"my-product"}
),
data_parallelism=2,
tool_config=ToolConfig(
name="code_evaluator",
)
)
print(response.json())
```
"""
data = CreateBatchCompletionsRequest(
model_config=model_config,
@@ -488,6 +520,7 @@
output_data_path=output_data_path,
data_parallelism=data_parallelism,
max_runtime_sec=max_runtime_sec,
tool_config=tool_config,
).dict()
response = cls.post_sync(
resource_name="v1/llm/batch-completions",
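The batch examples above assume the CreateBatchCompletionsRequestContent payload has already been written to `input_data_path`. A minimal sketch of producing that file, assuming the content fields shown in the existing batch examples (`prompts`, `max_new_tokens`, `temperature`) and using boto3 purely as an illustration of the upload step (the bucket and key are placeholders):

```python
import boto3

from llmengine.data_types import CreateBatchCompletionsRequestContent

# Build the content object that batch_create will read from input_data_path.
content = CreateBatchCompletionsRequestContent(
    prompts=["What is deep learning?", "What is a neural network?"],
    max_new_tokens=10,
    temperature=0.0,
)

# Serialize the pydantic model to JSON and upload it to the input path.
# "my-input-bucket" / "my-input-path" stand in for your own S3 location.
boto3.client("s3").put_object(
    Bucket="my-input-bucket",
    Key="my-input-path",
    Body=content.json().encode("utf-8"),
)
```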
30 changes: 30 additions & 0 deletions clients/python/llmengine/data_types.py
@@ -1,6 +1,7 @@
"""
DTOs for LLM APIs.
"""

import datetime
from enum import Enum
from typing import Any, Dict, List, Literal, Optional, Union
@@ -658,6 +659,30 @@ class CreateBatchCompletionsModelConfig(BaseModel):
"""


class ToolConfig(BaseModel):
"""
Configuration for tool use.
NOTE: this config is highly experimental and its signature will change significantly in future iterations.
"""

name: str
"""
Name of the tool to use for the batch inference.
"""
max_iterations: Optional[int] = 10
"""
Maximum number of iterations to run the tool.
"""
execution_timeout_seconds: Optional[int] = 60
"""
Maximum runtime of the tool in seconds.
"""
should_retry_on_error: Optional[bool] = True
"""
Whether to retry the tool on error.
"""


class CreateBatchCompletionsRequest(BaseModel):
"""
Request object for batch completions.
@@ -685,6 +710,11 @@ class CreateBatchCompletionsRequest(BaseModel):
"""
Maximum runtime of the batch inference in seconds. Default to one day.
"""
tool_config: Optional[ToolConfig] = None
"""
Configuration for tool use.
NOTE: this config is highly experimental and its signature will change significantly in future iterations.
"""


class CreateBatchCompletionsResponse(BaseModel):
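The ToolConfig fields introduced above can be exercised on their own; a small sketch that constructs the config with its documented defaults made explicit and shows how it serializes, since batch_create ultimately posts the request via `.dict()`:

```python
from llmengine.data_types import ToolConfig

# Construct a ToolConfig; the explicit values mirror the defaults in the class above.
tool_config = ToolConfig(
    name="code_evaluator",          # currently the only supported tool per the docs above
    max_iterations=10,
    execution_timeout_seconds=60,
    should_retry_on_error=True,
)

# batch_create serializes the whole request with .dict(), so the config becomes a plain dict.
print(tool_config.dict())
# {'name': 'code_evaluator', 'max_iterations': 10,
#  'execution_timeout_seconds': 60, 'should_retry_on_error': True}
```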
2 changes: 1 addition & 1 deletion clients/python/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "scale-llm-engine"
version = "0.0.0.beta25"
version = "0.0.0.beta26"
description = "Scale LLM Engine Python client"
license = "Apache-2.0"
authors = ["Phil Chen <[email protected]>"]
2 changes: 1 addition & 1 deletion clients/python/setup.py
@@ -3,6 +3,6 @@
setup(
name="scale-llm-engine",
python_requires=">=3.7",
version="0.0.0.beta25",
version="0.0.0.beta26",
packages=find_packages(),
)
1 change: 1 addition & 0 deletions docs/api/data_types.md
@@ -143,6 +143,7 @@
- model_config
- data_parallelism
- max_runtime_sec
- tool_config

::: llmengine.CreateBatchCompletionsResponse
options:
26 changes: 25 additions & 1 deletion docs/guides/completions.md
@@ -122,7 +122,7 @@ asyncio.run(main())

## Batch completions

The Python client also supports batch completions. Batch completions supports distributing data to multiple workers to accelerate inference. It also tries to maximize throughput so the completions should finish quite a bit faster than hitting models through HTTP. Use [Completion.batch_complete](../../api/python_client/#llmengine.completion.Completion.batch_complete) to utilize batch completions.
The Python client also supports batch completions. Batch completions support distributing data to multiple workers to accelerate inference. They also try to maximize throughput, so completions should finish quite a bit faster than calling models over HTTP. Use [Completion.batch_create](../../api/python_client/#llmengine.Completion.batch_create) to utilize batch completions.

Some examples of batch completions:

@@ -169,6 +169,30 @@ response = Completion.batch_create(
print(response.job_id)
```

=== "Batch completions with prompts and use tool"
For how to properly use the tool please see [Completion.batch_create](../../api/python_client/#llmengine.Completion.batch_create) tool_config doc.
```python
from llmengine import Completion
from llmengine.data_types import CreateBatchCompletionsModelConfig, CreateBatchCompletionsRequestContent, ToolConfig

# Store CreateBatchCompletionsRequestContent data into input file "s3://my-input-path"

response = Completion.batch_create(
input_data_path="s3://my-input-path",
output_data_path="s3://my-output-path",
model_config=CreateBatchCompletionsModelConfig(
model="llama-2-7b",
checkpoint_path="s3://checkpoint-path",
labels={"team":"my-team", "product":"my-product"}
),
data_parallelism=2,
tool_config=ToolConfig(
name="code_evaluator",
)
)
print(response.json())
```
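Because the code evaluator rewrites the generated text in place, prompts need to demonstrate the expected delimiters. The following is only a rough illustration of a few-shot prompt for the `code_evaluator` tool, based on the delimiter description in the tool_config documentation; the exact spacing, question text, and worked example are assumptions rather than a documented contract:

```python
# Rough illustration only: a few-shot prompt whose worked example uses the delimiters
# described in the tool_config docs ("```python\n" opens a code block, the text after
# "\n```\n" is replaced by the execution result, and "\n>>>\n" closes the code context).
FEW_SHOT_PROMPT = (
    "Question: What is 17 * 23?\n"
    "```python\n"
    "print(17 * 23)\n"
    "```\n"
    "391\n"   # plays the role of the execution output in the worked example
    ">>>\n"
    "Answer: 391\n"
    "\n"
    "Question: What is 12 squared?\n"
)

# This string would be one entry in the prompts list stored at input_data_path.
print(FEW_SHOT_PROMPT)
```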

## Which model should I use?

See the [Model Zoo](../../model_zoo) for more information on best practices for which model to use for Completions.
@@ -132,7 +132,7 @@ def get_vllm_engine(model, request):
tensor_parallel_size=request.model_config.num_shards,
seed=request.model_config.seed or 0,
disable_log_requests=True,
gpu_memory_utilization=0.8, # To avoid OOM errors when there's host machine GPU usage
gpu_memory_utilization=0.9,
)

llm = AsyncLLMEngine.from_engine_args(engine_args)
@@ -432,6 +432,7 @@ def check_unknown_startup_memory_usage():  # pragma: no cover
"""Check for unknown memory usage at startup."""
gpu_free_memory = get_gpu_free_memory()
if gpu_free_memory is not None:
print(f"GPU free memory at startup in MB: {gpu_free_memory}")
min_mem = min(gpu_free_memory)
max_mem = max(gpu_free_memory)
if max_mem - min_mem > 10:
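check_unknown_startup_memory_usage() relies on get_gpu_free_memory(), which is outside this diff. A minimal sketch of how per-GPU free memory can be queried with nvidia-smi, offered as an assumption about the helper's behavior rather than the actual implementation:

```python
import subprocess
from typing import List, Optional


def get_gpu_free_memory_sketch() -> Optional[List[int]]:
    """Return free memory in MiB for each visible GPU, or None if nvidia-smi fails."""
    try:
        output = subprocess.run(
            ["nvidia-smi", "--query-gpu=memory.free", "--format=csv,noheader,nounits"],
            capture_output=True,
            text=True,
            check=True,
        ).stdout
        # One line per GPU, e.g. "80321\n80154\n" -> [80321, 80154]
        return [int(line) for line in output.strip().splitlines()]
    except Exception:
        # nvidia-smi not installed, no driver, or no GPUs visible
        return None
```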
