diff --git a/.ci/gpu/reset-gcp-h100.sh b/.ci/gpu/reset-gcp-h100.sh new file mode 100644 index 0000000..9e8e063 --- /dev/null +++ b/.ci/gpu/reset-gcp-h100.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +# Script to reset NVIDIA H100 GPU on GCP +# To reset GPU status + +# Reset GPU and Memory clocks +sudo nvidia-smi -rgc +sudo nvidia-smi -rmc + +# Restore the default power limit (500W) +sudo nvidia-smi -pl 500 + +# Disable persistent mode +sudo nvidia-smi -pm 0 diff --git a/.ci/gpu/tune-gcp-h100.sh b/.ci/gpu/tune-gcp-h100.sh new file mode 100644 index 0000000..806348a --- /dev/null +++ b/.ci/gpu/tune-gcp-h100.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +# Script to tune NVIDIA H100 GPU on GCP +# To stabilize performance + +set -ex + +# Enable persistent mode +sudo nvidia-smi -pm 1 +# Lock power limit to 650W +sudo nvidia-smi -pl 650 + +# Default Memory Frequency: 2619 MHz +# Default Graphics Frequency: 1980 MHz +sudo nvidia-smi -lgc 1980,1980 +sudo nvidia-smi -lmc 2619,2619 +sudo nvidia-smi -ac 2619,1980 diff --git a/.github/workflows/_linux-benchmark-h100.yml b/.github/workflows/_linux-benchmark-h100.yml index ccfc4d6..cc7dd3c 100644 --- a/.github/workflows/_linux-benchmark-h100.yml +++ b/.github/workflows/_linux-benchmark-h100.yml @@ -36,7 +36,7 @@ jobs: submodules: recursive - name: Tune Nvidia GPU run: | - sudo nvidia-smi -pm 1 + bash .ci/gpu/tune-gcp-h100.sh sudo ldconfig nvidia-smi - name: Benchmarking @@ -52,4 +52,10 @@ jobs: run: | . 
"${SETUP_SCRIPT}" latest_result_json=$(find ./benchmark-output/ -name "result.json" | sort -r | head -n 1) - python .ci/upload/scribe.py --json ${latest_result_json} + python ./.ci/upload/scribe.py --json ${latest_result_json} + - name: Restore Nvidia GPU + if: always() + run: | + bash .ci/gpu/reset-gcp-h100.sh + sudo ldconfig + nvidia-smi diff --git a/benchmarks/nightly/run.py b/benchmarks/nightly/run.py index 2cae1df..5525296 100644 --- a/benchmarks/nightly/run.py +++ b/benchmarks/nightly/run.py @@ -46,6 +46,7 @@ def setup_tritonbench_cwd(): "latency,gbps", "--num-inputs", "6", + "--cudagraph", ], "bf16_gemm": [ "--op", @@ -58,6 +59,7 @@ def setup_tritonbench_cwd(): "latency,tflops", "--num-inputs", "4", + "--cudagraph", ], }