Skip to content

Commit

Permalink
Add ir-llm benchmark workflow (#2520)
Browse files Browse the repository at this point in the history
  • Loading branch information
lxning authored Nov 14, 2024
1 parent c5f1efc commit b9e1c7d
Show file tree
Hide file tree
Showing 18 changed files with 1,234 additions and 0 deletions.
86 changes: 86 additions & 0 deletions .github/workflows/sagemaker_llm_benchmark.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
name: SageMaker LLM Benchmark

on:
  schedule:
    - cron: '0 17 * * 5'  # Run every Friday at 5pm (UTC)

jobs:
  # Provisions a self-hosted CPU runner via the scheduler host. The created
  # instance id is exported as a job output so stop-runners can terminate it
  # even when the benchmark job fails.
  create-runners:
    runs-on: [self-hosted, scheduler]
    steps:
      - name: Create new CPU instance
        id: create_cpu1
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
          --fail \
          | jq '.token' | tr -d '"' )
          ./start_instance.sh action_cpu $token djl-serving
    outputs:
      cpu_instance_id1: ${{ steps.create_cpu1.outputs.action_cpu_instance_id }}

  # Runs the IR-LLM benchmark once per serving engine on the runner created
  # above.
  endpoint-tests:
    runs-on: [self-hosted, cpu]
    timeout-minutes: 120
    needs: create-runners
    strategy:
      fail-fast: false  # let the other engine finish even if one fails
      matrix:
        engine: [lmi-dist, trtllm]
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python3
        uses: actions/setup-python@v5
        with:
          python-version: '3.10.x'
      - name: Install pip dependencies
        run: pip3 install -U boto3 awscli
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::185921645874:role/github-actions-djl-serving
          aws-region: us-west-2
      - name: Install IR-LLM
        working-directory: tests/integration/benchmark/ir-llm
        run: |
          aws s3 cp s3://sagemaker-python-sdk-ir-llm/sagemaker-latest.dev0-py3-none-any.whl .
          aws s3 cp s3://sagemaker-python-sdk-ir-llm/sagemaker.normal.json .
          aws s3 cp s3://djl-accounts/hf_token .
          export HF_TOKEN=$(head -n1 hf_token|cut -d '=' -f2)
          pip3 install --quiet sagemaker-latest.dev0-py3-none-any.whl
          aws configure add-model --service-model file://sagemaker.normal.json --service-name sagemaker
      - name: Prepare dir to store metrics
        working-directory: tests/integration/benchmark/ir-llm
        run: |
          # Fixed: shell assignments must not have spaces around '='
          # ('METRICS_DIR = "..."' runs METRICS_DIR as a command and fails).
          METRICS_DIR="metrics_${{ matrix.engine }}"
          if [ ! -d "${METRICS_DIR}" ]; then
            mkdir -p "${METRICS_DIR}"
            echo "Directory ${METRICS_DIR} created."
          else
            echo "Directory ${METRICS_DIR} already exists."
          fi
      - name: Run IR-LLM
        # NOTE(review): added working-directory to match the preceding steps so
        # the metrics dir prepared above and the config/ paths resolve; confirm
        # that cw_metrics.py lives under scripts/ in this directory.
        working-directory: tests/integration/benchmark/ir-llm
        run: |
          # Fixed: no spaces around '=', and '$CONFIG_DIR}' -> '${CONFIG_DIR}'
          # (missing opening brace).
          METRICS_DIR="metrics_${{ matrix.engine }}"
          CONFIG_DIR="config/${{ matrix.engine }}"
          python3 scripts/cw_metrics.py -j ${CONFIG_DIR}/config.yml -c ${CONFIG_DIR}/config_ir_job -m ${METRICS_DIR}
          echo "sleep 30 seconds to allow endpoint deletion"
          sleep 30

  # Always runs (even on failure/cancellation) so SageMaker resources and the
  # benchmark runner instance are cleaned up.
  stop-runners:
    if: always()
    runs-on: [self-hosted, scheduler]
    needs: [create-runners, endpoint-tests]
    steps:
      - name: Cleanup dangling SageMaker resources
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          ./cleanup_sagemaker_resources.sh sm-integration-test us-west-2
      - name: Stop all instances
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          instance_id=${{ needs.create-runners.outputs.cpu_instance_id1 }}
          ./stop_instance.sh $instance_id
34 changes: 34 additions & 0 deletions tests/integration/benchmark/ir-llm/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# IR-LLM Benchmark on SageMaker Endpoint
This folder contains scripts and configurations to run benchmarks on SageMaker Endpoints using IR-LLM.


## Usage
Run the benchmark script:
```
cd tests/integration/benchmark/ir-llm
python cw_metrics.py -j config.yml -c ./configs -m ./metrics
```
This command runs the benchmark script cw_metrics.py with the following arguments:

-j config.yml: Specifies the main configuration file config.yml.
-c ./configs: Specifies the directory containing the IR-LLM configuration files for each model's test case.
-m ./metrics: Specifies the directory where the benchmark reports will be saved.

## Configuration
### config.yml
The config.yml file defines the overall benchmark configuration, including:

* cloudwatch_metrics_namespace: The CloudWatch namespace for the metrics.
* metrics_definitions: A list of metric definitions to be collected during the benchmark.
* benchmark_report_s3_location: The S3 location where the benchmark reports will be stored.
* model_test_cases: A list of model test cases to be benchmarked.

An example can be found at: ./config/lmi-dist/config.yml

### benchmark_config_xxx.json
The benchmark_config_xxx.json files in the configs directory define the IR-LLM configuration for each model's test case.

An example can be found at: ./config/lmi-dist/config_ir_job/benchmark_config_passive_Llama-3-1-405b-fp8.json

## Benchmark Reports
After running the benchmark, the reports will be saved in the specified S3 location. The reports will contain detailed metrics and performance data for each benchmarked model test case.
133 changes: 133 additions & 0 deletions tests/integration/benchmark/ir-llm/config/lmi-dist/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
region: "us-west-2"

cloudwatch:
  metrics_namespace: "SageMaker_LLM_Benchmark"

# Destination for benchmark reports.
s3:
  bucket_name: "djl-benchmark"
  folder: "sm-lmi-dist"

# Mapping from IR-LLM report fields to CloudWatch metric name/unit pairs.
metrics:
  timeToFirstToken_p50:
    metric_name: "TTFT_P50"
    unit: "Milliseconds"

  timeToFirstToken_p99:
    metric_name: "TTFT_P99"
    unit: "Milliseconds"

  intertokenLatency_p50:
    metric_name: "InterTokenLatency_P50"
    unit: "Milliseconds"

  intertokenLatency_p99:
    metric_name: "InterTokenLatency_P99"
    unit: "Milliseconds"

  costPerMillionInputTokens:
    metric_name: "CostPerMillionInputTokens"
    # NOTE(review): input cost uses "Count" while output cost uses "None" —
    # confirm this asymmetry is intentional.
    unit: "Count"

  costPerMillionOutputTokens:
    metric_name: "CostPerMillionOutputTokens"
    unit: "None"

  tokenizerFailed_Sum:
    metric_name: "TokenizerErrorRate"
    unit: "Percent"

  numberOfInputTokens_p50:
    metric_name: "NumberOfInputTokens_p50"
    unit: "None"

  numberOfInputTokens_p99:
    metric_name: "NumberOfInputTokens_p99"
    unit: "None"

  numberOfOutputTokens_p50:
    metric_name: "NumberOfOutputTokens_p50"
    unit: "None"

  numberOfOutputTokens_p99:
    metric_name: "NumberOfOutputTokens_p99"
    unit: "None"

  clientInvocationErrors_Sum:
    metric_name: "ClientInvocationErrorRate"
    unit: "Percent"

  emptyInferenceResponse_Sum:
    metric_name: "EmptyInferenceResponseRate"
    unit: "Percent"

# Model test cases. "action" flags use canonical booleans (true/false) instead
# of YAML 1.1 truthy scalars (yes/no), which parse differently across
# YAML 1.1 / 1.2 loaders.
benchmarks:
  - model: "Llama-3.1-8b"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_passive_Llama-3-1-8b.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_base_sample_payload_en_500-1000.tar.gz"
        action: true
  - model: "Llama-3.1-8b-suzuka"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_LMI_V12_Llama-3-1-8b-suzuka.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_base_payload_en_500-1000.tar.gz"
        action: false
  - model: "Llama-3.1-8b-instruct"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_passive_Llama-3-1-8b-instruct.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_instruct_sample_payload_en_500-1000.tar.gz"
        action: false
  - model: "Llama-3.1-8b-instruct-suzuka"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_LMI_V12_Llama-3-1-8b-instruct-suzuka.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_instruct_payload_en_500-1000.tar.gz"
        action: false
  - model: "Llama-3.1-70b"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_passive_Llama-3-1-70b.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_base_sample_payload_en_500-1000.tar.gz"
        action: false
  - model: "Llama-3.1-70b-suzuka"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_LMI_V12_Llama-3-1-70b-suzuka.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_base_payload_en_500-1000.tar.gz"
        action: false
  - model: "Llama-3.1-70b-instruct"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_passive_Llama-3-1-70b-instruct.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_instruct_sample_payload_en_500-1000.tar.gz"
        action: false
  - model: "Llama-3.1-70b-instruct-suzuka"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_LMI_V12_Llama-3-1-70b-instruct-suzuka.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_instruct_payload_en_500-1000.tar.gz"
        action: false
  - model: "Llama-3.1-405b-fp8"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_passive_Llama-3-1-405b-fp8.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_base_sample_payload_en_500-1000.tar.gz"
        action: false
  - model: "Llama-3.1-405b-instruct-fp8"
    endpoints:
      - endpoint: "sagemaker"
        image: "LMI-dist"
        config: "benchmark_config_passive_Llama-3-1-405b-instruct-fp8.json"
        dataset: "s3://djl-benchmark-datasets/openorca/openorca_instruct_sample_payload_en_500-1000.tar.gz"
        action: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{
"tokenizer_model_id": "meta-llama/Meta-Llama-3.1-405B-FP8",
"jumpstart_model_id": "meta-textgeneration-llama-3-1-405b-fp8",
"use_jumpstart_prod_artifact": true,
"image_uri": "125045733377.dkr.ecr.us-west-2.amazonaws.com/djl-serving:lmi-nightly",
"image_uri_args": {
"framework": "djl-lmi",
"version": "nightly"
},
"model_args": {
"env": {
"HF_MODEL_ID": "/opt/ml/model/",
"OPTION_MAX_MODEL_LEN": "8192",
"OPTION_USE_PASSIVE_WORKERS": "true"
},
"enable_network_isolation": true
},
"benchmark_configurations": [
{
"instance_type": "ml.p4d.24xlarge",
"env_params": {
"TENSOR_PARALLEL_DEGREE": [
"8"
]
}
},
{
"instance_type": "ml.p5.48xlarge",
"env_params": {
"TENSOR_PARALLEL_DEGREE": [
"8"
]
}
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"tokenizer_model_id": "meta-llama/Meta-Llama-3.1-405B-Instruct-FP8",
"jumpstart_model_id": "meta-textgeneration-llama-3-1-405b-instruct-fp8",
"use_jumpstart_prod_artifact": true,
"image_uri": "125045733377.dkr.ecr.us-west-2.amazonaws.com/djl-serving:lmi-nightly",
"image_uri_args": {
"framework": "djl-lmi",
"version": "nightly"
},
"model_args": {
"env": {
"HF_MODEL_ID": "/opt/ml/model/",
"OPTION_MAX_MODEL_LEN": "8192",
"OPTION_USE_PASSIVE_WORKERS": "true"
},
"enable_network_isolation": true
},
"benchmark_configurations": [
{
"instance_type": "ml.p5.48xlarge",
"env_params": {
"TENSOR_PARALLEL_DEGREE": [
"8"
]
}
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
{
"tokenizer_model_id": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"jumpstart_model_id": "meta-textgeneration-llama-3-1-70b-instruct",
"use_jumpstart_prod_artifact": true,
"image_uri": "125045733377.dkr.ecr.us-west-2.amazonaws.com/djl-serving:lmi-nightly",
"image_uri_args": {
"framework": "djl-lmi",
"version": "nightly"
},
"model_args": {
"env": {
"HF_MODEL_ID": "/opt/ml/model/",
"OPTION_MAX_MODEL_LEN": "8192",
"OPTION_USE_PASSIVE_WORKERS": "true"
},
"enable_network_isolation": true
},
"benchmark_configurations": [
{
"instance_type": "ml.g5.48xlarge",
"env_params": {
"TENSOR_PARALLEL_DEGREE": [
"8"
]
}
},
{
"instance_type": "ml.g6.48xlarge",
"env_params": {
"TENSOR_PARALLEL_DEGREE": [
"8"
]
}
},
{
"instance_type": "ml.p4d.24xlarge",
"env_params": {
"TENSOR_PARALLEL_DEGREE": [
"8"
]
}
},
{
"instance_type": "ml.p5.48xlarge",
"env_params": {
"TENSOR_PARALLEL_DEGREE": [
"8"
]
}
}
]
}
Loading

0 comments on commit b9e1c7d

Please sign in to comment.