Skip to content

Commit

Permalink
#0: Update perf tests to have tighter timeouts to reduce queue time and time spent waiting on hangs to timeout (#8552)
Browse files Browse the repository at this point in the history
  • Loading branch information
tt-rkim authored May 16, 2024
1 parent d04684f commit e36a8f5
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 18 deletions.
18 changes: 9 additions & 9 deletions .github/workflows/perf-device-models.yaml
Original file line number Diff line number Diff line change
@@ -13,18 +13,18 @@ jobs:
# so we try not to get hanging machines
fail-fast: false
matrix:
runner-info: [
{name: "GS", arch: grayskull, runs-on: ["perf-no-reset-grayskull", "self-reset"], machine-type: "bare_metal"},
{name: "N300 WH B0", arch: wormhole_b0, runs-on: ["perf-wormhole_b0", "self-reset"], machine-type: "bare_metal"},
test-info: [
{name: "GS", arch: grayskull, runs-on: ["perf-no-reset-grayskull", "self-reset"], machine-type: "bare_metal", timeout: 40},
{name: "N300 WH B0", arch: wormhole_b0, runs-on: ["perf-wormhole_b0", "self-reset"], machine-type: "bare_metal", timeout: 20},
]
name: "${{ matrix.runner-info.name }} device perf"
name: "${{ matrix.test-info.name }} device perf"
env:
TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
ARCH_NAME: ${{ matrix.runner-info.arch }}
ARCH_NAME: ${{ matrix.test-info.arch }}
TTNN_CONFIG_OVERRIDES: '{"enable_fast_runtime_mode": true}'
LOGURU_LEVEL: INFO
environment: dev
runs-on: ${{ matrix.runner-info.runs-on }}
runs-on: ${{ matrix.test-info.runs-on }}
steps:
- uses: tenstorrent-metal/metal-workflows/.github/actions/[email protected]
- name: Ensure weka mount is active
@@ -39,10 +39,10 @@ jobs:
run: |
./scripts/build_scripts/build_with_profiler_opt.sh
- name: Run device performance regressions
timeout-minutes: 90
timeout-minutes: ${{ matrix.test-info.timeout }}
run: |
source build/python_env/bin/activate
./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type models_device_performance_${{ matrix.runner-info.machine-type }}
./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type models_device_performance_${{ matrix.test-info.machine-type }}
- name: Check device perf report exists
id: check-device-perf-report
if: ${{ !cancelled() }}
@@ -55,5 +55,5 @@ jobs:
if: ${{ !cancelled() && steps.check-device-perf-report.conclusion == 'success' }}
uses: actions/upload-artifact@v4
with:
name: device-perf-report-csv-${{ matrix.runner-info.arch }}-${{ matrix.runner-info.machine-type }}
name: device-perf-report-csv-${{ matrix.test-info.arch }}-${{ matrix.test-info.machine-type }}
path: "${{ steps.check-device-perf-report.outputs.device_perf_report_filename }}"
18 changes: 9 additions & 9 deletions .github/workflows/perf-models.yaml
Original file line number Diff line number Diff line change
@@ -17,19 +17,19 @@ jobs:
# so we try not to get hanging machines
fail-fast: false
matrix:
runner-info: [
test-info: [
{name: "GS", arch: grayskull, runs-on: ["perf-grayskull", "self-reset"], machine-type: "bare_metal"},
{name: "N300 WH B0", arch: wormhole_b0, runs-on: ["perf-wormhole_b0", "self-reset"], machine-type: "bare_metal"},
]
model-type: [llm_javelin, cnn_javelin, other]
name: "${{ matrix.model-type }} ${{ matrix.runner-info.name }}"
name: "${{ matrix.model-type }} ${{ matrix.test-info.name }}"
env:
TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
ARCH_NAME: ${{ matrix.runner-info.arch }}
ARCH_NAME: ${{ matrix.test-info.arch }}
LOGURU_LEVEL: INFO
TTNN_CONFIG_OVERRIDES: '{"enable_fast_runtime_mode": true}'
environment: dev
runs-on: ${{ matrix.runner-info.runs-on }}
runs-on: ${{ matrix.test-info.runs-on }}
steps:
- uses: tenstorrent-metal/metal-workflows/.github/actions/[email protected]
- name: Enable Performance mode
@@ -46,16 +46,16 @@ jobs:
echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
name: TTMetal_build_${{ matrix.runner-info.arch }}
name: TTMetal_build_${{ matrix.test-info.arch }}
- name: Extract files
run: tar -xvf ttm_${{ matrix.runner-info.arch }}.tar
run: tar -xvf ttm_${{ matrix.test-info.arch }}.tar
- uses: ./.github/actions/install-python-deps
- name: Run performance regressions
id: performance_tests
timeout-minutes: 60
timeout-minutes: 30
run: |
source ${{ github.workspace }}/python_env/bin/activate
./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type ${{ matrix.model-type }}_models_performance_${{ matrix.runner-info.machine-type }}
./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type ${{ matrix.model-type }}_models_performance_${{ matrix.test-info.machine-type }}
- uses: ./.github/actions/slack-report
if: ${{ steps.performance_tests.outcome != 'success' }}
with:
@@ -72,7 +72,7 @@ jobs:
if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }}
uses: actions/upload-artifact@v4
with:
name: perf-report-csv-${{ matrix.model-type }}-${{ matrix.runner-info.arch }}-${{ matrix.runner-info.machine-type }}
name: perf-report-csv-${{ matrix.model-type }}-${{ matrix.test-info.arch }}-${{ matrix.test-info.machine-type }}
path: "${{ steps.check-perf-report.outputs.perf_report_filename }}"
- name: Disable Performance mode
if: always()
Expand Down

0 comments on commit e36a8f5

Please sign in to comment.