#0: Update perf tests to have tighter timeouts to reduce queue time and time spent waiting on hangs to timeout (#8552)
tenstorrent · May 16, 2024 · e36a8f5 · e36a8f5
1 parent d04684f
commit e36a8f5
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 18 deletions.
diff --git a/.github/workflows/perf-device-models.yaml b/.github/workflows/perf-device-models.yaml
@@ -13,18 +13,18 @@ jobs:
       # so we try not to get hanging machines
       fail-fast: false
       matrix:
-        runner-info: [
-          {name: "GS", arch: grayskull, runs-on: ["perf-no-reset-grayskull", "self-reset"], machine-type: "bare_metal"},
-          {name: "N300 WH B0", arch: wormhole_b0, runs-on: ["perf-wormhole_b0", "self-reset"], machine-type: "bare_metal"},
+        test-info: [
+          {name: "GS", arch: grayskull, runs-on: ["perf-no-reset-grayskull", "self-reset"], machine-type: "bare_metal", timeout: 40},
+          {name: "N300 WH B0", arch: wormhole_b0, runs-on: ["perf-wormhole_b0", "self-reset"], machine-type: "bare_metal", timeout: 20},
         ]
-    name: "${{ matrix.runner-info.name }} device perf"
+    name: "${{ matrix.test-info.name }} device perf"
     env:
       TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
-      ARCH_NAME: ${{ matrix.runner-info.arch }}
+      ARCH_NAME: ${{ matrix.test-info.arch }}
       TTNN_CONFIG_OVERRIDES: '{"enable_fast_runtime_mode": true}'
       LOGURU_LEVEL: INFO
     environment: dev
-    runs-on: ${{ matrix.runner-info.runs-on }}
+    runs-on: ${{ matrix.test-info.runs-on }}
     steps:
       - uses: tenstorrent-metal/metal-workflows/.github/actions/[email protected]
       - name: Ensure weka mount is active
@@ -39,10 +39,10 @@ jobs:
         run: |
           ./scripts/build_scripts/build_with_profiler_opt.sh
       - name: Run device performance regressions
-        timeout-minutes: 90
+        timeout-minutes: ${{ matrix.test-info.timeout }}
         run: |
           source build/python_env/bin/activate
-          ./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type models_device_performance_${{ matrix.runner-info.machine-type }}
+          ./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type models_device_performance_${{ matrix.test-info.machine-type }}
       - name: Check device perf report exists
         id: check-device-perf-report
         if: ${{ !cancelled() }}
@@ -55,5 +55,5 @@ jobs:
         if: ${{ !cancelled() && steps.check-device-perf-report.conclusion == 'success' }}
         uses: actions/upload-artifact@v4
         with:
-          name: device-perf-report-csv-${{ matrix.runner-info.arch }}-${{ matrix.runner-info.machine-type }}
+          name: device-perf-report-csv-${{ matrix.test-info.arch }}-${{ matrix.test-info.machine-type }}
           path: "${{ steps.check-device-perf-report.outputs.device_perf_report_filename }}"
diff --git a/.github/workflows/perf-models.yaml b/.github/workflows/perf-models.yaml
@@ -17,19 +17,19 @@ jobs:
       # so we try not to get hanging machines
       fail-fast: false
       matrix:
-        runner-info: [
+        test-info: [
           {name: "GS", arch: grayskull, runs-on: ["perf-grayskull", "self-reset"], machine-type: "bare_metal"},
           {name: "N300 WH B0", arch: wormhole_b0, runs-on: ["perf-wormhole_b0", "self-reset"], machine-type: "bare_metal"},
         ]
         model-type: [llm_javelin, cnn_javelin, other]
-    name: "${{ matrix.model-type }} ${{ matrix.runner-info.name }}"
+    name: "${{ matrix.model-type }} ${{ matrix.test-info.name }}"
     env:
       TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
-      ARCH_NAME: ${{ matrix.runner-info.arch }}
+      ARCH_NAME: ${{ matrix.test-info.arch }}
       LOGURU_LEVEL: INFO
       TTNN_CONFIG_OVERRIDES: '{"enable_fast_runtime_mode": true}'
     environment: dev
-    runs-on: ${{ matrix.runner-info.runs-on }}
+    runs-on: ${{ matrix.test-info.runs-on }}
     steps:
       - uses: tenstorrent-metal/metal-workflows/.github/actions/[email protected]
       - name: Enable Performance mode
@@ -46,16 +46,16 @@ jobs:
           echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV
       - uses: actions/download-artifact@v4
         with:
-          name: TTMetal_build_${{ matrix.runner-info.arch }}
+          name: TTMetal_build_${{ matrix.test-info.arch }}
       - name: Extract files
-        run: tar -xvf ttm_${{ matrix.runner-info.arch }}.tar
+        run: tar -xvf ttm_${{ matrix.test-info.arch }}.tar
       - uses: ./.github/actions/install-python-deps
       - name: Run performance regressions
         id: performance_tests
-        timeout-minutes: 60
+        timeout-minutes: 30
         run: |
           source ${{ github.workspace }}/python_env/bin/activate
-          ./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type ${{ matrix.model-type }}_models_performance_${{ matrix.runner-info.machine-type }}
+          ./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type ${{ matrix.model-type }}_models_performance_${{ matrix.test-info.machine-type }}
       - uses: ./.github/actions/slack-report
         if: ${{ steps.performance_tests.outcome != 'success' }}
         with:
@@ -72,7 +72,7 @@ jobs:
         if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }}
         uses: actions/upload-artifact@v4
         with:
-          name: perf-report-csv-${{ matrix.model-type }}-${{ matrix.runner-info.arch }}-${{ matrix.runner-info.machine-type }}
+          name: perf-report-csv-${{ matrix.model-type }}-${{ matrix.test-info.arch }}-${{ matrix.test-info.machine-type }}
           path: "${{ steps.check-perf-report.outputs.perf_report_filename }}"
       - name: Disable Performance mode
         if: always()