[fix] Context length estimate datatype (#1350)
sindhuvahinis authored Dec 1, 2023
1 parent 879537e commit 1b4f2d1
Showing 4 changed files with 42 additions and 29 deletions.
@@ -14,7 +14,7 @@
import logging
import os
import re
- from typing import Optional
+ from typing import Optional, Union, List

from pydantic import validator, root_validator
from enum import IntEnum, Enum
@@ -50,7 +50,7 @@ class TransformerNeuronXProperties(Properties):
load_in_8bit: Optional[bool] = False
low_cpu_mem_usage: bool = False
load_split_model: bool = False
- context_length_estimate: Optional[dict] = None
+ context_length_estimate: Optional[List[int]] = None
amp: Optional[str] = None
quantize: Optional[TnXQuantizeMethods] = None
compiled_graph_path: Optional[str] = None
@@ -62,7 +62,10 @@ def set_neuron_optimal_env(cls, level):

@validator('context_length_estimate', pre=True)
def parse_context_length(cls, context_length_estimate):
- return json.loads(context_length_estimate)
+ return [
+ int(context_length)
+ for context_length in context_length_estimate.split(',')
+ ]

@validator('rolling_batch', pre=True)
def validate_rolling_batch(cls, rolling_batch: str) -> str:
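To see what the new validator does end to end, here is a minimal runnable sketch, assuming pydantic v1-style validators (as the `validator`/`root_validator` imports above suggest); `ContextLengthDemo` is an illustrative stand-in, not a class from this repository:

```python
from typing import List, Optional

from pydantic import BaseModel, validator


class ContextLengthDemo(BaseModel):
    # Illustrative stand-in for TransformerNeuronXProperties; only the field
    # touched by this commit is modeled here.
    context_length_estimate: Optional[List[int]] = None

    @validator('context_length_estimate', pre=True)
    def parse_context_length(cls, context_length_estimate):
        # Comma-separated string -> list of ints; int() tolerates the
        # surrounding whitespace in values such as "256, 512, 1024".
        return [
            int(context_length)
            for context_length in context_length_estimate.split(',')
        ]


demo = ContextLengthDemo(context_length_estimate='256, 512, 1024')
print(demo.context_length_estimate)  # [256, 512, 1024]
```

A single value such as '256' parses to [256], which the new unit test below exercises.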
41 changes: 25 additions & 16 deletions engines/python/setup/djl_python/tests/test_properties_manager.py
@@ -105,18 +105,12 @@ def test_tnx_all_configs(self):
# TODO: Replace with actual example of context_length_estimate

properties = {
"n_positions":
"2048",
"load_split_model":
"true",
"load_in_8bit":
"true",
"compiled_graph_path":
"s3://test/bucket/folder",
"low_cpu_mem_usage":
"true",
'context_length_estimate':
'{"context_length": "128", "variable_size": "12"}'
"n_positions": "2048",
"load_split_model": "true",
"load_in_8bit": "true",
"compiled_graph_path": "s3://test/bucket/folder",
"low_cpu_mem_usage": "true",
'context_length_estimate': '256, 512, 1024'
}
tnx_configs = TransformerNeuronXProperties(**common_properties,
**properties)
@@ -128,10 +122,18 @@ self.assertEqual(
self.assertTrue(tnx_configs.load_in_8bit)
self.assertTrue(tnx_configs.low_cpu_mem_usage)

- self.assertDictEqual(tnx_configs.context_length_estimate, {
- 'context_length': '128',
- 'variable_size': '12'
- })
+ self.assertListEqual(tnx_configs.context_length_estimate,
+ [256, 512, 1024])

+ # tests context length estimate as integer
+ def test_tnx_cle_int(context_length_estimate):
+ properties['context_length_estimate'] = context_length_estimate
+ configs = TransformerNeuronXProperties(**common_properties,
+ **properties)
+ self.assertEqual(configs.context_length_estimate, [256])
+ del properties['context_length_estimate']
+
+ test_tnx_cle_int('256')

def test_tnx_configs_error_case(self):
properties = {
@@ -152,8 +154,15 @@ def test_non_existent_directory(directory):
TransformerNeuronXProperties(**common_properties, **properties)
del properties['compiled_graph_path']

+ def test_invalid_context_length(context_length_estimate):
+ properties['context_length_estimate'] = context_length_estimate
+ with self.assertRaises(ValueError):
+ TransformerNeuronXProperties(**common_properties, **properties)
+ del properties['context_length_estimate']

test_url_not_s3_uri("https://random.url.address/")
test_non_existent_directory("not_a_directory")
+ test_invalid_context_length("invalid")

def test_trtllm_configs(self):
properties = {
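One note on the new error-case test: `assertRaises(ValueError)` is sufficient because the `int()` call inside the validator raises `ValueError`, and pydantic wraps it in a `ValidationError`, which in pydantic v1 is itself a `ValueError` subclass. A self-contained sketch of that behavior, again assuming pydantic v1 (the `Demo` model is illustrative, not repository code):

```python
from typing import List, Optional

from pydantic import BaseModel, ValidationError, validator


class Demo(BaseModel):
    context_length_estimate: Optional[List[int]] = None

    @validator('context_length_estimate', pre=True)
    def parse_context_length(cls, value):
        return [int(v) for v in value.split(',')]


try:
    Demo(context_length_estimate='invalid')
except ValidationError as err:
    # ValidationError subclasses ValueError in pydantic v1, so the unit test's
    # assertRaises(ValueError) catches this failure.
    assert isinstance(err, ValueError)
    print(err)
```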
20 changes: 10 additions & 10 deletions serving/docs/lmi/configurations_large_model_inference_containers.md
@@ -94,17 +94,17 @@ If you specify Engine to be MPI, rolling_batch to vllm in DeepSpeed container, t

If you are using Neuron container and engine set to Python, the following parameter will be accessible.

| Item                            | Required | Description | Example value |
|---------------------------------|----------|-------------|---------------|
| option.n_positions              | No       | Input sequence length | Default: `128` |
| option.load_in_8bit             | No       | Specify this option to quantize your model using the supported quantization methods in TransformerNeuronX | `False`, `True` Default: `False` |
| Advanced parameters |
| option.unroll                   | No       | Unroll the model graph for compilation. With `unroll=None` compiler will have more opportunities to do optimizations across the layers | Default: `None` |
| option.neuron_optimize_level    | No       | Neuron runtime compiler optimization level, determines the type of optimizations applied during compilation. The higher optimize level we go, the longer time will spend on compilation. But in exchange, you will get better latency/throughput. Default value is not set (optimize level 2) that have a balance of compilation time and performance | `1`,`2`,`3` Default: `2` |
- | option.context_length_estimate | No | Estimated context input length for Llama models. Customer can specify different size bucket to increase the KV cache re-usability. This will help to improve latency | Default: `None` |
+ | option.context_length_estimate | No | Estimated context input length for Llama models. Customer can specify different size bucket to increase the KV cache re-usability. This will help to improve latency | Example: `256,512,1024` (integers separated by comma if multiple values) <br/> Default: `None` |
| option.low_cpu_mem_usage        | No       | Reduce CPU memory usage when loading models. | Default: `False` |
| option.load_split_model         | No       | Toggle to True when using model artifacts that have already been split for neuron compilation/loading. | Default: `False` |
| option.compiled_graph_path      | No       | Provide an s3 URI, or a local directory that stores the pre-compiled graph for your model (NEFF cache) to skip runtime compilation. | Default: `None` |


### TensorRT-LLM
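For illustration, the comma-separated format documented above would appear in a `serving.properties` file roughly like the following sketch; the model id, bucket, and other values are placeholders and not part of this commit:

```
engine=Python
option.model_id=s3://my-bucket/llama-2-7b/
option.n_positions=2048
option.load_split_model=true
option.context_length_estimate=256,512,1024
```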
1 change: 1 addition & 0 deletions tests/integration/llm/prepare.py
@@ -389,6 +389,7 @@
"option.n_positions": 512,
"option.model_loading_timeout": 2400,
"option.load_split_model": True,
"option.context_length_estimate": '256, 512, 1024'
},
"opt-1.3b-streaming": {
"option.model_id": "s3://djl-llm/opt-1.3b/",
