Skip to content

Commit

Permalink
Add ir-llm benchmark workflow (#2520)
Browse files Browse the repository at this point in the history
  • Loading branch information
lxning authored Nov 14, 2024
1 parent c5f1efc commit b9e1c7d
Show file tree
Hide file tree
Showing 18 changed files with 1,234 additions and 0 deletions.
86 changes: 86 additions & 0 deletions .github/workflows/sagemaker_llm_benchmark.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
name: SageMaker LLM Benchmark

on:
  schedule:
    - cron: '0 17 * * 5'  # Run every Friday at 5pm (UTC)

jobs:
  # Provisions a self-hosted CPU runner via the scheduler host. The created
  # instance id is exported as a job output so stop-runners can terminate it
  # even when the benchmark job fails.
  create-runners:
    runs-on: [self-hosted, scheduler]
    steps:
      - name: Create new CPU instance
        id: create_cpu1
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
          --fail \
          | jq '.token' | tr -d '"' )
          ./start_instance.sh action_cpu $token djl-serving
    outputs:
      cpu_instance_id1: ${{ steps.create_cpu1.outputs.action_cpu_instance_id }}

  # Runs the IR-LLM benchmark once per serving engine on the runner created
  # above.
  endpoint-tests:
    runs-on: [self-hosted, cpu]
    timeout-minutes: 120
    needs: create-runners
    strategy:
      fail-fast: false  # let the other engine finish even if one fails
      matrix:
        engine: [lmi-dist, trtllm]
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python3
        uses: actions/setup-python@v5
        with:
          python-version: '3.10.x'
      - name: Install pip dependencies
        run: pip3 install -U boto3 awscli
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::185921645874:role/github-actions-djl-serving
          aws-region: us-west-2
      - name: Install IR-LLM
        working-directory: tests/integration/benchmark/ir-llm
        run: |
          aws s3 cp s3://sagemaker-python-sdk-ir-llm/sagemaker-latest.dev0-py3-none-any.whl .
          aws s3 cp s3://sagemaker-python-sdk-ir-llm/sagemaker.normal.json .
          aws s3 cp s3://djl-accounts/hf_token .
          export HF_TOKEN=$(head -n1 hf_token|cut -d '=' -f2)
          pip3 install --quiet sagemaker-latest.dev0-py3-none-any.whl
          aws configure add-model --service-model file://sagemaker.normal.json --service-name sagemaker
      - name: Prepare dir to store metrics
        working-directory: tests/integration/benchmark/ir-llm
        run: |
          # Fixed: shell assignments must not have spaces around '='
          # ('METRICS_DIR = "..."' runs METRICS_DIR as a command and fails).
          METRICS_DIR="metrics_${{ matrix.engine }}"
          if [ ! -d "${METRICS_DIR}" ]; then
            mkdir -p "${METRICS_DIR}"
            echo "Directory ${METRICS_DIR} created."
          else
            echo "Directory ${METRICS_DIR} already exists."
          fi
      - name: Run IR-LLM
        # NOTE(review): added working-directory to match the preceding steps so
        # the metrics dir prepared above and the config/ paths resolve; confirm
        # that cw_metrics.py lives under scripts/ in this directory.
        working-directory: tests/integration/benchmark/ir-llm
        run: |
          # Fixed: no spaces around '=', and '$CONFIG_DIR}' -> '${CONFIG_DIR}'
          # (missing opening brace).
          METRICS_DIR="metrics_${{ matrix.engine }}"
          CONFIG_DIR="config/${{ matrix.engine }}"
          python3 scripts/cw_metrics.py -j ${CONFIG_DIR}/config.yml -c ${CONFIG_DIR}/config_ir_job -m ${METRICS_DIR}
          echo "sleep 30 seconds to allow endpoint deletion"
          sleep 30

  # Always runs (even on failure/cancellation) so SageMaker resources and the
  # benchmark runner instance are cleaned up.
  stop-runners:
    if: always()
    runs-on: [self-hosted, scheduler]
    needs: [create-runners, endpoint-tests]
    steps:
      - name: Cleanup dangling SageMaker resources
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          ./cleanup_sagemaker_resources.sh sm-integration-test us-west-2
      - name: Stop all instances
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          instance_id=${{ needs.create-runners.outputs.cpu_instance_id1 }}
          ./stop_instance.sh $instance_id
34 changes: 34 additions & 0 deletions tests/integration/benchmark/ir-llm/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# IR-LLM Benchmark on SageMaker Endpoint
This folder contains scripts and configurations to run benchmarks on SageMaker Endpoints using IR-LLM.


## Usage
Run the benchmark script:
```
cd tests/integration/benchmark/ir-llm
python cw_metrics.py -j config.yml -c ./configs -m ./metrics
```
This command runs the benchmark script cw_metrics.py with the following arguments:

-j config.yml: Specifies the main configuration file config.yml.
-c ./configs: Specifies the directory containing the IR-LLM configuration files for each model's test case.
-m ./metrics: Specifies the directory where the benchmark reports will be saved.

## Configuration
### config.yml
The config.yml file defines the overall benchmark configuration, including:

* cloudwatch_metrics_namespace: The CloudWatch namespace for the metrics.
* metrics_definitions: A list of metric definitions to be collected during the benchmark.
* benchmark_report_s3_location: The S3 location where the benchmark reports will be stored.
* model_test_cases: A list of model test cases to be benchmarked.

An example can be found at: ./config/lmi-dist/config.yml

### benchmark_config_xxx.json
The benchmark_config_xxx.json files in the configs directory define the IR-LLM configuration for each model's test case.

An example can be found at: ./config/lmi-dist/config_ir_job/benchmark_config_passive_Llama-3-1-405b-fp8.json

## Benchmark Reports
After running the benchmark, the reports will be saved in the specified S3 location. The reports will contain detailed metrics and performance data for each benchmarked model test case.
133 changes: 133 additions & 0 deletions tests/integration/benchmark/ir-llm/config/lmi-dist/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
region: "us-west-2"

cloudwatch:
  metrics_namespace: "SageMaker_LLM_Benchmark"

# Destination for benchmark reports.
s3:
  bucket_name: "djl-benchmark"
  folder: "sm-lmi-dist"

# Mapping from IR-LLM report fields to CloudWatch metric name/unit pairs.
metrics:
  timeToFirstToken_p50:
    metric_name: "TTFT_P50"
    unit: "Milliseconds"

  timeToFirstToken_p99:
    metric_name: "TTFT_P99"
    unit: "Milliseconds"

  intertokenLatency_p50:
    metric_name: "InterTokenLatency_P50"
    unit: "Milliseconds"

  intertokenLatency_p99:
    metric_name: "InterTokenLatency_P99"
    unit: "Milliseconds"

  costPerMillionInputTokens:
    metric_name: "CostPerMillionInputTokens"
    # NOTE(review): input cost uses "Count" while output cost uses "None" —
    # confirm this asymmetry is intentional.
    unit: "Count"

  costPerMillionOutputTokens:
    metric_name: "CostPerMillionOutputTokens"
    unit: "None"

  tokenizerFailed_Sum:
    metric_name: "TokenizerErrorRate"
    unit: "Percent"

  numberOfInputTokens_p50:
    metric_name: "NumberOfInputTokens_p50"
    unit: "None"

  numberOfInputTokens_p99:
    metric_name: "NumberOfInputTokens_p99"
    unit: "None"

  numberOfOutputTokens_p50:
    metric_name: "NumberOfOutputTokens_p50"
    unit: "None"

  numberOfOutputTokens_p99:
    metric_name: "NumberOfOutputTokens_p99"
    unit: "None"

  clientInvocationErrors_Sum:
    metric_name: "ClientInvocationErrorRate"
    unit: "Percent"

  emptyInferenceResponse_Sum:
    metric_name: "EmptyInferenceResponseRate"
    unit: "Percent"

# Model test cases. "action" flags use canonical booleans (true/false) instead
# of YAML 1.1 truthy scalars (yes/no), which parse differently across
# YAML 1.1 / 1.2 loaders.
benchmarks:
  - model: "Llama-3.1-8b"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_passive_Llama-3-1-8b.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_base_sample_payload_en_500-1000.tar.gz"
        action: true
  - model: "Llama-3.1-8b-suzuka"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_LMI_V12_Llama-3-1-8b-suzuka.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_base_payload_en_500-1000.tar.gz"
        action: false
  - model: "Llama-3.1-8b-instruct"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_passive_Llama-3-1-8b-instruct.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_instruct_sample_payload_en_500-1000.tar.gz"
        action: false
  - model: "Llama-3.1-8b-instruct-suzuka"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_LMI_V12_Llama-3-1-8b-instruct-suzuka.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_instruct_payload_en_500-1000.tar.gz"
        action: false
  - model: "Llama-3.1-70b"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_passive_Llama-3-1-70b.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_base_sample_payload_en_500-1000.tar.gz"
        action: false
  - model: "Llama-3.1-70b-suzuka"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_LMI_V12_Llama-3-1-70b-suzuka.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_base_payload_en_500-1000.tar.gz"
        action: false
  - model: "Llama-3.1-70b-instruct"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_passive_Llama-3-1-70b-instruct.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_instruct_sample_payload_en_500-1000.tar.gz"
        action: false
  - model: "Llama-3.1-70b-instruct-suzuka"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_LMI_V12_Llama-3-1-70b-instruct-suzuka.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_instruct_payload_en_500-1000.tar.gz"
        action: false
  - model: "Llama-3.1-405b-fp8"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_passive_Llama-3-1-405b-fp8.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_base_sample_payload_en_500-1000.tar.gz"
        action: false
  - model: "Llama-3.1-405b-instruct-fp8"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_passive_Llama-3-1-405b-instruct-fp8.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_instruct_sample_payload_en_500-1000.tar.gz"
        action: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{
"tokenizer_model_id": "meta-llama/Meta-Llama-3.1-405B-FP8",
"jumpstart_model_id": "meta-textgeneration-llama-3-1-405b-fp8",
"use_jumpstart_prod_artifact": true,
"image_uri": "125045733377.dkr.ecr.us-west-2.amazonaws.com/djl-serving:lmi-nightly",
"image_uri_args": {
"framework": "djl-lmi",
"version": "nightly"
},
"model_args": {
"env": {
"HF_MODEL_ID": "/opt/ml/model/",
"OPTION_MAX_MODEL_LEN": "8192",
"OPTION_USE_PASSIVE_WORKERS": "true"
},
"enable_network_isolation": true
},
"benchmark_configurations": [
{
"instance_type": "ml.p4d.24xlarge",
"env_params": {
"TENSOR_PARALLEL_DEGREE": [
"8"
]
}
},
{
"instance_type": "ml.p5.48xlarge",
"env_params": {
"TENSOR_PARALLEL_DEGREE": [
"8"
]
}
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"tokenizer_model_id": "meta-llama/Meta-Llama-3.1-405B-Instruct-FP8",
"jumpstart_model_id": "meta-textgeneration-llama-3-1-405b-instruct-fp8",
"use_jumpstart_prod_artifact": true,
"image_uri": "125045733377.dkr.ecr.us-west-2.amazonaws.com/djl-serving:lmi-nightly",
"image_uri_args": {
"framework": "djl-lmi",
"version": "nightly"
},
"model_args": {
"env": {
"HF_MODEL_ID": "/opt/ml/model/",
"OPTION_MAX_MODEL_LEN": "8192",
"OPTION_USE_PASSIVE_WORKERS": "true"
},
"enable_network_isolation": true
},
"benchmark_configurations": [
{
"instance_type": "ml.p5.48xlarge",
"env_params": {
"TENSOR_PARALLEL_DEGREE": [
"8"
]
}
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
{
"tokenizer_model_id": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"jumpstart_model_id": "meta-textgeneration-llama-3-1-70b-instruct",
"use_jumpstart_prod_artifact": true,
"image_uri": "125045733377.dkr.ecr.us-west-2.amazonaws.com/djl-serving:lmi-nightly",
"image_uri_args": {
"framework": "djl-lmi",
"version": "nightly"
},
"model_args": {
"env": {
"HF_MODEL_ID": "/opt/ml/model/",
"OPTION_MAX_MODEL_LEN": "8192",
"OPTION_USE_PASSIVE_WORKERS": "true"
},
"enable_network_isolation": true
},
"benchmark_configurations": [
{
"instance_type": "ml.g5.48xlarge",
"env_params": {
"TENSOR_PARALLEL_DEGREE": [
"8"
]
}
},
{
"instance_type": "ml.g6.48xlarge",
"env_params": {
"TENSOR_PARALLEL_DEGREE": [
"8"
]
}
},
{
"instance_type": "ml.p4d.24xlarge",
"env_params": {
"TENSOR_PARALLEL_DEGREE": [
"8"
]
}
},
{
"instance_type": "ml.p5.48xlarge",
"env_params": {
"TENSOR_PARALLEL_DEGREE": [
"8"
]
}
}
]
}
Loading

0 comments on commit b9e1c7d

Please sign in to comment.