From e36a8f515a4cb1f7e9ebaf5fb7b43512f7163324 Mon Sep 17 00:00:00 2001 From: Raymond Kim <109366641+tt-rkim@users.noreply.github.com> Date: Thu, 16 May 2024 09:06:36 -0400 Subject: [PATCH] #0: Update perf tests to have tighter timeouts to reduce queue time and time spent waiting on hangs to timeout (#8552) --- .github/workflows/perf-device-models.yaml | 18 +++++++++--------- .github/workflows/perf-models.yaml | 18 +++++++++--------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/.github/workflows/perf-device-models.yaml b/.github/workflows/perf-device-models.yaml index 97d87b35e56..12705a62317 100644 --- a/.github/workflows/perf-device-models.yaml +++ b/.github/workflows/perf-device-models.yaml @@ -13,18 +13,18 @@ jobs: # so we try not to get hanging machines fail-fast: false matrix: - runner-info: [ - {name: "GS", arch: grayskull, runs-on: ["perf-no-reset-grayskull", "self-reset"], machine-type: "bare_metal"}, - {name: "N300 WH B0", arch: wormhole_b0, runs-on: ["perf-wormhole_b0", "self-reset"], machine-type: "bare_metal"}, + test-info: [ + {name: "GS", arch: grayskull, runs-on: ["perf-no-reset-grayskull", "self-reset"], machine-type: "bare_metal", timeout: 40}, + {name: "N300 WH B0", arch: wormhole_b0, runs-on: ["perf-wormhole_b0", "self-reset"], machine-type: "bare_metal", timeout: 20}, ] - name: "${{ matrix.runner-info.name }} device perf" + name: "${{ matrix.test-info.name }} device perf" env: TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} - ARCH_NAME: ${{ matrix.runner-info.arch }} + ARCH_NAME: ${{ matrix.test-info.arch }} TTNN_CONFIG_OVERRIDES: '{"enable_fast_runtime_mode": true}' LOGURU_LEVEL: INFO environment: dev - runs-on: ${{ matrix.runner-info.runs-on }} + runs-on: ${{ matrix.test-info.runs-on }} steps: - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 - name: Ensure weka mount is active @@ -39,10 +39,10 @@ jobs: run: | ./scripts/build_scripts/build_with_profiler_opt.sh - name: Run device performance regressions - timeout-minutes: 90 + timeout-minutes: ${{ matrix.test-info.timeout }} run: | source build/python_env/bin/activate - ./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type models_device_performance_${{ matrix.runner-info.machine-type }} + ./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type models_device_performance_${{ matrix.test-info.machine-type }} - name: Check device perf report exists id: check-device-perf-report if: ${{ !cancelled() }} @@ -55,5 +55,5 @@ jobs: if: ${{ !cancelled() && steps.check-device-perf-report.conclusion == 'success' }} uses: actions/upload-artifact@v4 with: - name: device-perf-report-csv-${{ matrix.runner-info.arch }}-${{ matrix.runner-info.machine-type }} + name: device-perf-report-csv-${{ matrix.test-info.arch }}-${{ matrix.test-info.machine-type }} path: "${{ steps.check-device-perf-report.outputs.device_perf_report_filename }}" diff --git a/.github/workflows/perf-models.yaml b/.github/workflows/perf-models.yaml index 46fd0d376c1..eb1bdbfbbb7 100644 --- a/.github/workflows/perf-models.yaml +++ b/.github/workflows/perf-models.yaml @@ -17,19 +17,19 @@ jobs: # so we try not to get hanging machines fail-fast: false matrix: - runner-info: [ + test-info: [ {name: "GS", arch: grayskull, runs-on: ["perf-grayskull", "self-reset"], machine-type: "bare_metal"}, {name: "N300 WH B0", arch: wormhole_b0, runs-on: ["perf-wormhole_b0", "self-reset"], machine-type: "bare_metal"}, ] model-type: [llm_javelin, cnn_javelin, other] - name: "${{ matrix.model-type }} ${{ matrix.runner-info.name }}" + name: "${{ matrix.model-type }} ${{ matrix.test-info.name }}" env: TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} - ARCH_NAME: ${{ matrix.runner-info.arch }} + ARCH_NAME: ${{ matrix.test-info.arch }} LOGURU_LEVEL: INFO TTNN_CONFIG_OVERRIDES: '{"enable_fast_runtime_mode": true}' environment: dev - runs-on: ${{ matrix.runner-info.runs-on }} + runs-on: ${{ matrix.test-info.runs-on }} steps: - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 - name: Enable Performance mode @@ -46,16 +46,16 @@ jobs: echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV - uses: actions/download-artifact@v4 with: - name: TTMetal_build_${{ matrix.runner-info.arch }} + name: TTMetal_build_${{ matrix.test-info.arch }} - name: Extract files - run: tar -xvf ttm_${{ matrix.runner-info.arch }}.tar + run: tar -xvf ttm_${{ matrix.test-info.arch }}.tar - uses: ./.github/actions/install-python-deps - name: Run performance regressions id: performance_tests - timeout-minutes: 60 + timeout-minutes: 30 run: | source ${{ github.workspace }}/python_env/bin/activate - ./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type ${{ matrix.model-type }}_models_performance_${{ matrix.runner-info.machine-type }} + ./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type ${{ matrix.model-type }}_models_performance_${{ matrix.test-info.machine-type }} - uses: ./.github/actions/slack-report if: ${{ steps.performance_tests.outcome != 'success' }} with: @@ -72,7 +72,7 @@ jobs: if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }} uses: actions/upload-artifact@v4 with: - name: perf-report-csv-${{ matrix.model-type }}-${{ matrix.runner-info.arch }}-${{ matrix.runner-info.machine-type }} + name: perf-report-csv-${{ matrix.model-type }}-${{ matrix.test-info.arch }}-${{ matrix.test-info.machine-type }} path: "${{ steps.check-perf-report.outputs.perf_report_filename }}" - name: Disable Performance mode if: always()