From 851f441fc21c8b2438f393c8f3beae4a0540019e Mon Sep 17 00:00:00 2001
From: Tapasvi Patel
Date: Fri, 17 May 2024 20:42:18 +0000
Subject: [PATCH] #0: Add t3k stress test pipeline

---
 .github/workflows/t3000-stress-tests.yaml     | 55 +++++++++++++++++++
 tests/scripts/run_tests.sh                    | 11 ++++
 tests/scripts/t3000/run_t3000_stress_tests.sh | 36 ++++++++++++
 3 files changed, 102 insertions(+)
 create mode 100644 .github/workflows/t3000-stress-tests.yaml
 create mode 100755 tests/scripts/t3000/run_t3000_stress_tests.sh

diff --git a/.github/workflows/t3000-stress-tests.yaml b/.github/workflows/t3000-stress-tests.yaml
new file mode 100644
index 00000000000..d1bc8133acc
--- /dev/null
+++ b/.github/workflows/t3000-stress-tests.yaml
@@ -0,0 +1,55 @@
+name: "[T3K] T3000 stress tests"
+
+on:
+  push:
+    branches:
+      - t3000-stress-pipeline
+  workflow_dispatch:
+  schedule:
+    - cron: "0 */3 * * *" # This cron schedule runs the workflow every 3 hours
+
+jobs:
+  build-artifact:
+    uses: ./.github/workflows/build-artifact.yaml
+    with:
+      arch: '["wormhole_b0"]'
+    secrets: inherit
+  t3000-stress-tests:
+    needs: build-artifact
+    strategy:
+      fail-fast: false
+      matrix:
+        test-group: [
+          {
+            name: "T3000 stress tests",
+            arch: wormhole_b0,
+            runs-on: [arch-wormhole_b0, "config-t3000", "in-service", "runner-test", "bare-metal", "pipeline-functional"],
+            cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type stress_t3000_device --dispatch-mode ""'
+          },
+        ]
+    name: ${{ matrix.test-group.name }}
+    env:
+      TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
+      ARCH_NAME: ${{ matrix.test-group.arch }}
+      LOGURU_LEVEL: INFO
+      LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
+    environment: dev
+    runs-on: ${{ matrix.test-group.runs-on }}
+    steps:
+      - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0
+      - name: Set up dynamic env vars for build
+        run: |
+          echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
+      - uses: actions/download-artifact@v4
+        with:
+          name: TTMetal_build_${{ matrix.test-group.arch }}
+      - name: Extract files
+        run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+      - uses: ./.github/actions/install-python-deps
+      - name: Run stress regression tests
+        timeout-minutes: 120
+        run: |
+          source ${{ github.workspace }}/python_env/bin/activate
+          cd $TT_METAL_HOME
+          export PYTHONPATH=$TT_METAL_HOME
+          ${{ matrix.test-group.cmd }}
diff --git a/tests/scripts/run_tests.sh b/tests/scripts/run_tests.sh
index fe6a4e5276a..569905b039c 100755
--- a/tests/scripts/run_tests.sh
+++ b/tests/scripts/run_tests.sh
@@ -223,6 +223,15 @@ model_perf_t3000_device() {
 
     ./tests/scripts/t3000/run_t3000_model_perf_tests.sh --pipeline-type "$pipeline_type"
 }
+
+# Run t3000 stress tests
+stress_t3000_device() {
+    local tt_arch=$1
+    local pipeline_type=$2
+    local dispatch_mode=$3
+
+    ./tests/scripts/t3000/run_t3000_stress_tests.sh
+}
 ##########################T3000##########################
 
 ##########################TG##########################
@@ -336,6 +345,8 @@ run_pipeline_tests() {
         demos_t3000_device "$tt_arch" "$pipeline_type" "$dispatch_mode"
     elif [[ $pipeline_type == *"model_perf_t3000_device" ]]; then
         model_perf_t3000_device "$tt_arch" "$pipeline_type" "$dispatch_mode"
+    elif [[ $pipeline_type == "stress_t3000_device" ]]; then
+        stress_t3000_device "$tt_arch" "$pipeline_type" "$dispatch_mode"
     # TG pipelines
     elif [[ $pipeline_type == "unit_tg_device" ]]; then
         unit_tg_device "$tt_arch" "$pipeline_type" "$dispatch_mode"
diff --git a/tests/scripts/t3000/run_t3000_stress_tests.sh b/tests/scripts/t3000/run_t3000_stress_tests.sh
new file mode 100755
index 00000000000..b0ddd981cbf
--- /dev/null
+++ b/tests/scripts/t3000/run_t3000_stress_tests.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# T3000 stress tests; expects TT_METAL_HOME and ARCH_NAME in the environment.
+set -eo pipefail
+
+run_t3000_stress_tests() {
+  TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectSendAllConnectedChips"
+  # NOTE(review): presumably resets chips 0-3 before the second run - confirm.
+  tt-smi-metal -r 0,1,2,3
+
+  TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectSendAllConnectedChips"
+}
+
+run_t3000_tests() {
+  # Run stress tests
+  run_t3000_stress_tests
+}
+
+main() {
+  if [[ -z "$TT_METAL_HOME" ]]; then
+    echo "Must provide TT_METAL_HOME in environment" 1>&2
+    exit 1
+  fi
+
+  if [[ -z "$ARCH_NAME" ]]; then
+    echo "Must provide ARCH_NAME in environment" 1>&2
+    exit 1
+  fi
+
+  # Run all tests from TT_METAL_HOME
+  cd "$TT_METAL_HOME"
+  export PYTHONPATH="$TT_METAL_HOME"
+
+  run_t3000_tests
+}
+
+main "$@"