Add ir-llm benchmark workflow (#2520)
Showing 18 changed files with 1,234 additions and 0 deletions.
@@ -0,0 +1,86 @@
name: SageMaker LLM Benchmark

on:
  schedule:
    - cron: '0 17 * * 5'  # Run every Friday at 5 PM UTC

jobs:
  create-runners:
    runs-on: [self-hosted, scheduler]
    steps:
      - name: Create new CPU instance
        id: create_cpu1
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
          --fail \
          | jq '.token' | tr -d '"' )
          ./start_instance.sh action_cpu $token djl-serving
    outputs:
      cpu_instance_id1: ${{ steps.create_cpu1.outputs.action_cpu_instance_id }}

  endpoint-tests:
    runs-on: [ self-hosted, cpu ]
    timeout-minutes: 120
    needs: create-runners
    strategy:
      fail-fast: false
      matrix:
        engine: [lmi-dist, trtllm]
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python3
        uses: actions/setup-python@v5
        with:
          python-version: '3.10.x'
      - name: Install pip dependencies
        run: pip3 install -U boto3 awscli
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::185921645874:role/github-actions-djl-serving
          aws-region: us-west-2
      - name: Install IR-LLM
        working-directory: tests/integration/benchmark/ir-llm
        run: |
          aws s3 cp s3://sagemaker-python-sdk-ir-llm/sagemaker-latest.dev0-py3-none-any.whl .
          aws s3 cp s3://sagemaker-python-sdk-ir-llm/sagemaker.normal.json .
          aws s3 cp s3://djl-accounts/hf_token .
          export HF_TOKEN=$(head -n1 hf_token | cut -d '=' -f2)
          pip3 install --quiet sagemaker-latest.dev0-py3-none-any.whl
          aws configure add-model --service-model file://sagemaker.normal.json --service-name sagemaker
      - name: Prepare dir to store metrics
        working-directory: tests/integration/benchmark/ir-llm
        run: |
          METRICS_DIR="metrics_${{ matrix.engine }}"
          if [ ! -d "${METRICS_DIR}" ]; then
            mkdir -p "${METRICS_DIR}"
            echo "Directory ${METRICS_DIR} created."
          else
            echo "Directory ${METRICS_DIR} already exists."
          fi
      - name: Run IR-LLM
        run: |
          METRICS_DIR="metrics_${{ matrix.engine }}"
          CONFIG_DIR="config/${{ matrix.engine }}"
          python3 scripts/cw_metrics.py -j ${CONFIG_DIR}/config.yml -c ${CONFIG_DIR}/config_ir_job -m ${METRICS_DIR}
          echo "sleep 30 seconds to allow endpoint deletion"
          sleep 30

  stop-runners:
    if: always()
    runs-on: [ self-hosted, scheduler ]
    needs: [ create-runners, endpoint-tests ]
    steps:
      - name: Cleanup dangling SageMaker resources
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          ./cleanup_sagemaker_resources.sh sm-integration-test us-west-2
      - name: Stop all instances
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          instance_id=${{ needs.create-runners.outputs.cpu_instance_id1 }}
          ./stop_instance.sh $instance_id
@@ -0,0 +1,34 @@
# IR-LLM Benchmark on SageMaker Endpoint
This folder contains scripts and configurations to run benchmarks against SageMaker endpoints using IR-LLM.

## Usage
Run the benchmark script:
```
cd tests/integration/benchmark/ir-llm
python cw_metrics.py -j config.yml -c ./configs -m ./metrics
```
This runs cw_metrics.py with the following arguments:

* `-j config.yml`: the main configuration file.
* `-c ./configs`: the directory containing the IR-LLM configuration files for each model's test case.
* `-m ./metrics`: the directory where the benchmark reports will be saved.
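For a concrete invocation, the scheduled GitHub Actions workflow in this PR calls the script once per engine with engine-specific config directories. The lmi-dist case looks roughly like this; paths mirror the workflow's `Run IR-LLM` step, so treat the exact layout as an assumption:
```
# One engine from the CI matrix (engine=lmi-dist); trtllm follows the same pattern.
METRICS_DIR="metrics_lmi-dist"   # where reports/metrics are written
CONFIG_DIR="config/lmi-dist"     # engine-specific configuration directory
mkdir -p "${METRICS_DIR}"
python3 scripts/cw_metrics.py -j "${CONFIG_DIR}/config.yml" -c "${CONFIG_DIR}/config_ir_job" -m "${METRICS_DIR}"
```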

## Configuration
### config.yml
The config.yml file defines the overall benchmark configuration, including:

* cloudwatch: the CloudWatch metrics namespace the benchmark publishes to.
* s3: the S3 bucket and folder where the benchmark reports will be stored.
* metrics: the metric definitions to be collected during the benchmark.
* benchmarks: the list of model test cases to be benchmarked.

An example can be found at ./config/lmi-dist/config.yml; an abbreviated sketch follows.
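The sketch below is trimmed from that example to show only the overall shape (one metric definition and one model test case kept); how the s3 fields map to the report location is inferred from the description above:
```
region: "us-west-2"

cloudwatch:
  metrics_namespace: "SageMaker_LLM_Benchmark"

s3:
  bucket_name: "djl-benchmark"   # reports land under s3://<bucket_name>/<folder>/ (assumed layout)
  folder: "sm-lmi-dist"

metrics:
  timeToFirstToken_p50:          # one of several metric definitions
    metric_name: "TTFT_P50"
    unit: "Milliseconds"

benchmarks:
  - model: "Llama-3.1-8b"        # one of several model test cases
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_passive_Llama-3-1-8b.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_base_sample_payload_en_500-1000.tar.gz"
        action: yes
```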

### benchmark_config_xxx.json
The benchmark_config_xxx.json files in the configs directory define the IR-LLM configuration for each model's test case: the tokenizer and JumpStart model IDs, the serving image, the model environment, and the instance types to benchmark on.

An example can be found at ./config/lmi-dist/config_ir_job/benchmark_config_passive_Llama-3-1-405b-fp8.json; its top-level structure is sketched below.
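A minimal outline of that file's top-level keys, with values trimmed (see the full example for the complete benchmark_configurations list):
```
{
  "tokenizer_model_id": "meta-llama/Meta-Llama-3.1-405B-FP8",
  "jumpstart_model_id": "meta-textgeneration-llama-3-1-405b-fp8",
  "use_jumpstart_prod_artifact": true,
  "image_uri": "125045733377.dkr.ecr.us-west-2.amazonaws.com/djl-serving:lmi-nightly",
  "image_uri_args": { "framework": "djl-lmi", "version": "nightly" },
  "model_args": {
    "env": { "HF_MODEL_ID": "/opt/ml/model/", "OPTION_MAX_MODEL_LEN": "8192" },
    "enable_network_isolation": true
  },
  "benchmark_configurations": [
    { "instance_type": "ml.p4d.24xlarge", "env_params": { "TENSOR_PARALLEL_DEGREE": ["8"] } }
  ]
}
```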

## Benchmark Reports
After running the benchmark, the reports are saved to the configured S3 location and contain detailed metrics and performance data for each benchmarked model test case.
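For example, with the s3 settings from ./config/lmi-dist/config.yml (bucket djl-benchmark, folder sm-lmi-dist), the reports can be listed and downloaded with the AWS CLI; the exact object layout under that prefix is an assumption:
```
# List the report objects produced by the lmi-dist benchmark runs
aws s3 ls s3://djl-benchmark/sm-lmi-dist/ --recursive

# Download everything under the prefix into a local directory for inspection
aws s3 cp s3://djl-benchmark/sm-lmi-dist/ ./reports/ --recursive
```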
tests/integration/benchmark/ir-llm/config/lmi-dist/config.yml (133 additions & 0 deletions)
@@ -0,0 +1,133 @@
region: "us-west-2"

cloudwatch:
  metrics_namespace: "SageMaker_LLM_Benchmark"

s3:
  bucket_name: "djl-benchmark"
  folder: "sm-lmi-dist"

metrics:
  timeToFirstToken_p50:
    metric_name: "TTFT_P50"
    unit: "Milliseconds"

  timeToFirstToken_p99:
    metric_name: "TTFT_P99"
    unit: "Milliseconds"

  intertokenLatency_p50:
    metric_name: "InterTokenLatency_P50"
    unit: "Milliseconds"

  intertokenLatency_p99:
    metric_name: "InterTokenLatency_P99"
    unit: "Milliseconds"

  costPerMillionInputTokens:
    metric_name: "CostPerMillionInputTokens"
    unit: "Count"

  costPerMillionOutputTokens:
    metric_name: "CostPerMillionOutputTokens"
    unit: "None"

  tokenizerFailed_Sum:
    metric_name: "TokenizerErrorRate"
    unit: "Percent"

  numberOfInputTokens_p50:
    metric_name: "NumberOfInputTokens_p50"
    unit: "None"

  numberOfInputTokens_p99:
    metric_name: "NumberOfInputTokens_p99"
    unit: "None"

  numberOfOutputTokens_p50:
    metric_name: "NumberOfOutputTokens_p50"
    unit: "None"

  numberOfOutputTokens_p99:
    metric_name: "NumberOfOutputTokens_p99"
    unit: "None"

  clientInvocationErrors_Sum:
    metric_name: "ClientInvocationErrorRate"
    unit: "Percent"

  emptyInferenceResponse_Sum:
    metric_name: "EmptyInferenceResponseRate"
    unit: "Percent"

benchmarks:
  - model: "Llama-3.1-8b"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_passive_Llama-3-1-8b.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_base_sample_payload_en_500-1000.tar.gz"
        action: yes
  - model: "Llama-3.1-8b-suzuka"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_LMI_V12_Llama-3-1-8b-suzuka.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_base_payload_en_500-1000.tar.gz"
        action: no
  - model: "Llama-3.1-8b-instruct"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_passive_Llama-3-1-8b-instruct.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_instruct_sample_payload_en_500-1000.tar.gz"
        action: no
  - model: "Llama-3.1-8b-instruct-suzuka"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_LMI_V12_Llama-3-1-8b-instruct-suzuka.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_instruct_payload_en_500-1000.tar.gz"
        action: no
  - model: "Llama-3.1-70b"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_passive_Llama-3-1-70b.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_base_sample_payload_en_500-1000.tar.gz"
        action: no
  - model: "Llama-3.1-70b-suzuka"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_LMI_V12_Llama-3-1-70b-suzuka.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_base_payload_en_500-1000.tar.gz"
        action: no
  - model: "Llama-3.1-70b-instruct"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_passive_Llama-3-1-70b-instruct.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_instruct_sample_payload_en_500-1000.tar.gz"
        action: no
  - model: "Llama-3.1-70b-instruct-suzuka"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_LMI_V12_Llama-3-1-70b-instruct-suzuka.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_instruct_payload_en_500-1000.tar.gz"
        action: no
  - model: "Llama-3.1-405b-fp8"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_passive_Llama-3-1-405b-fp8.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_base_sample_payload_en_500-1000.tar.gz"
        action: no
  - model: "Llama-3.1-405b-instruct-fp8"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_passive_Llama-3-1-405b-instruct-fp8.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_instruct_sample_payload_en_500-1000.tar.gz"
        action: no
...ark/ir-llm/config/lmi-dist/config_ir_job/benchmark_config_passive_Llama-3-1-405b-fp8.json (36 additions & 0 deletions)
@@ -0,0 +1,36 @@
{
  "tokenizer_model_id": "meta-llama/Meta-Llama-3.1-405B-FP8",
  "jumpstart_model_id": "meta-textgeneration-llama-3-1-405b-fp8",
  "use_jumpstart_prod_artifact": true,
  "image_uri": "125045733377.dkr.ecr.us-west-2.amazonaws.com/djl-serving:lmi-nightly",
  "image_uri_args": {
    "framework": "djl-lmi",
    "version": "nightly"
  },
  "model_args": {
    "env": {
      "HF_MODEL_ID": "/opt/ml/model/",
      "OPTION_MAX_MODEL_LEN": "8192",
      "OPTION_USE_PASSIVE_WORKERS": "true"
    },
    "enable_network_isolation": true
  },
  "benchmark_configurations": [
    {
      "instance_type": "ml.p4d.24xlarge",
      "env_params": {
        "TENSOR_PARALLEL_DEGREE": [
          "8"
        ]
      }
    },
    {
      "instance_type": "ml.p5.48xlarge",
      "env_params": {
        "TENSOR_PARALLEL_DEGREE": [
          "8"
        ]
      }
    }
  ]
}
...m/config/lmi-dist/config_ir_job/benchmark_config_passive_Llama-3-1-405b-instruct-fp8.json (28 additions & 0 deletions)
@@ -0,0 +1,28 @@
{
  "tokenizer_model_id": "meta-llama/Meta-Llama-3.1-405B-Instruct-FP8",
  "jumpstart_model_id": "meta-textgeneration-llama-3-1-405b-instruct-fp8",
  "use_jumpstart_prod_artifact": true,
  "image_uri": "125045733377.dkr.ecr.us-west-2.amazonaws.com/djl-serving:lmi-nightly",
  "image_uri_args": {
    "framework": "djl-lmi",
    "version": "nightly"
  },
  "model_args": {
    "env": {
      "HF_MODEL_ID": "/opt/ml/model/",
      "OPTION_MAX_MODEL_LEN": "8192",
      "OPTION_USE_PASSIVE_WORKERS": "true"
    },
    "enable_network_isolation": true
  },
  "benchmark_configurations": [
    {
      "instance_type": "ml.p5.48xlarge",
      "env_params": {
        "TENSOR_PARALLEL_DEGREE": [
          "8"
        ]
      }
    }
  ]
}
...ir-llm/config/lmi-dist/config_ir_job/benchmark_config_passive_Llama-3-1-70b-instruct.json (52 additions & 0 deletions)
@@ -0,0 +1,52 @@
{
  "tokenizer_model_id": "meta-llama/Meta-Llama-3-70B-Instruct",
  "jumpstart_model_id": "meta-textgeneration-llama-3-1-70b-instruct",
  "use_jumpstart_prod_artifact": true,
  "image_uri": "125045733377.dkr.ecr.us-west-2.amazonaws.com/djl-serving:lmi-nightly",
  "image_uri_args": {
    "framework": "djl-lmi",
    "version": "nightly"
  },
  "model_args": {
    "env": {
      "HF_MODEL_ID": "/opt/ml/model/",
      "OPTION_MAX_MODEL_LEN": "8192",
      "OPTION_USE_PASSIVE_WORKERS": "true"
    },
    "enable_network_isolation": true
  },
  "benchmark_configurations": [
    {
      "instance_type": "ml.g5.48xlarge",
      "env_params": {
        "TENSOR_PARALLEL_DEGREE": [
          "8"
        ]
      }
    },
    {
      "instance_type": "ml.g6.48xlarge",
      "env_params": {
        "TENSOR_PARALLEL_DEGREE": [
          "8"
        ]
      }
    },
    {
      "instance_type": "ml.p4d.24xlarge",
      "env_params": {
        "TENSOR_PARALLEL_DEGREE": [
          "8"
        ]
      }
    },
    {
      "instance_type": "ml.p5.48xlarge",
      "env_params": {
        "TENSOR_PARALLEL_DEGREE": [
          "8"
        ]
      }
    }
  ]
}