Skip to content

Commit

Permalink
#8558: Refactor t3000, tg and tgg pipelines, workflows and run test scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
tapspatel committed May 17, 2024
1 parent 8024d15 commit 607faa1
Show file tree
Hide file tree
Showing 26 changed files with 687 additions and 349 deletions.
2 changes: 0 additions & 2 deletions .github/workflows/run-profiler-regression.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@ jobs:
{arch: wormhole_b0, runs-on: ["wormhole_b0", "multi-chip-num-pcie-1", "multi-chip-num-chips-1"]},
# N300
{arch: wormhole_b0, runs-on: ["wormhole_b0", "multi-chip-num-pcie-1", "multi-chip-num-chips-2"]},
# N300 2x4
{name: "n300-2x4", arch: wormhole_b0, runs-on: ["wormhole_b0", "multi-chip-num-pcie-4", "multi-chip-num-chips-8"]},
]
env:
TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,38 +1,35 @@
name: "Multi-chip demo tests"
name: "[T3K] T3000 demo tests"

on:
workflow_dispatch:
schedule:
- cron: '0 0 * * *' # This cron schedule runs the workflow every day at 12am UTC
- cron: '0 0 * * 6' # This cron schedule runs the workflow every Saturday at 12am UTC

jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
with:
arch: '["wormhole_b0"]'
secrets: inherit

multi-chip-unit-tests:
t3000-demo-tests:
needs: build-artifact
strategy:
# Do not fail-fast because we need to ensure all tests go to completion
# so we try not to get hanging machines
fail-fast: false
matrix:
test-group: [
# N300 2x4
{
name: "T3000 end to end demo tests",
name: "T3000 demo tests",
arch: wormhole_b0,
runs-on: ["perf-t3000", "arch-wormhole_b0", "multi-chip-num-pcie-4", "multi-chip-num-chips-8"],
machine-type: "bare_metal",
cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type end_to_end_demos_multi_device --dispatch-mode ""'
runs-on: [arch-wormhole_b0, "config-t3000", "in-service", "runner-test", "bare-metal", "pipeline-functional"],
cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type demos_t3000_device --dispatch-mode ""'
},
]
name: ${{ matrix.test-group.name }} ${{ matrix.test-group.arch }}
name: ${{ matrix.test-group.name }}
env:
TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
ARCH_NAME: ${{ matrix.test-group.arch }}
LOGURU_LEVEL: INFO
LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
environment: dev
runs-on: ${{ matrix.test-group.runs-on }}
steps:
Expand All @@ -46,9 +43,10 @@ jobs:
- name: Extract files
run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
- uses: ./.github/actions/install-python-deps
- name: Run pre/post regression tests
- name: Run demo regression tests
timeout-minutes: 180
run: |
source ${{ github.workspace }}/python_env/bin/activate
cd $TT_METAL_HOME
export PYTHONPATH=$TT_METAL_HOME
${{ matrix.test-group.cmd }}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: "Nightly multi-chip tests"
name: "[T3K] T3000 frequent tests"

on:
workflow_dispatch:
Expand All @@ -11,38 +11,42 @@ jobs:
with:
arch: '["wormhole_b0"]'
secrets: inherit

multi-chip-nightly:
t3000-frequent-tests:
needs: build-artifact
strategy:
# Do not fail-fast because we need to ensure all tests go to completion
# so we try not to get hanging machines
fail-fast: false
matrix:
runner-info: [
# N300 2x4
{name: "n300-2x4", arch: wormhole_b0, runs-on: ["wormhole_b0", "multi-chip-num-pcie-4", "multi-chip-num-chips-8"]},
test-group: [
{
name: "T3000 frequent tests",
arch: wormhole_b0,
runs-on: [arch-wormhole_b0, "config-t3000", "in-service", "runner-test", "bare-metal", "pipeline-functional"],
cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_t3000_device --dispatch-mode ""'
},
]
name: ${{ matrix.test-group.name }}
env:
TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
ARCH_NAME: ${{ matrix.runner-info.arch }}
ARCH_NAME: ${{ matrix.test-group.arch }}
LOGURU_LEVEL: INFO
LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
environment: dev
runs-on: ${{ matrix.runner-info.runs-on }}
runs-on: ${{ matrix.test-group.runs-on }}
steps:
- uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0
- name: Set up dynamic env vars for build
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
name: TTMetal_build_${{ matrix.runner-info.arch }}
name: TTMetal_build_${{ matrix.test-group.arch }}
- name: Extract files
run: tar -xvf ttm_${{ matrix.runner-info.arch }}.tar
run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
- uses: ./.github/actions/install-python-deps
- name: Run frequent regression tests
timeout-minutes: 60
run: |
source ${{ github.workspace }}/python_env/bin/activate
cd $TT_METAL_HOME
export PYTHONPATH=$TT_METAL_HOME
./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type frequent_multi_device --dispatch-mode ""
${{ matrix.test-group.cmd }}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: "Multi-Nebula model perf regressions and output report"
name: "[T3K] T3000 model perf tests"

on:
workflow_dispatch:
Expand All @@ -11,32 +11,38 @@ jobs:
with:
arch: '["wormhole_b0"]'
secrets: inherit

multi-device-models-perf:
t3000-model-perf-tests:
needs: build-artifact
strategy:
# Do not fail-fast because we need to ensure all tests go to completion
# so we try not to get hanging machines
fail-fast: false
matrix:
runner-info: [
# N300 2x4
# NOTE: Never use arch-wormhole_b0 tags, however we're using it here because this machine is used by devs during the day
# We don't want other CI runs to interrupt dev flows. However, we need to fix this once we have more 2x4 machines dedicated to CI
{name: "n300-2x4", arch: wormhole_b0, runs-on: ["perf-t3000", "arch-wormhole_b0", "multi-chip-num-pcie-4", "multi-chip-num-chips-8"], machine-type: "bare_metal"},
test-group: [
{
name: "T3000 LLM model perf tests",
model-type: "LLM",
arch: wormhole_b0,
runs-on: [arch-wormhole_b0, "config-t3000", "in-service", "runner-test", "bare-metal", "pipeline-perf"],
cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type llm_model_perf_t3000_device --dispatch-mode ""'
},
{
name: "T3000 CNN model perf tests",
model-type: "CNN",
arch: wormhole_b0,
runs-on: [arch-wormhole_b0, "config-t3000", "in-service", "runner-test", "bare-metal", "pipeline-perf"],
cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type cnn_model_perf_t3000_device --dispatch-mode ""'
},
]
model-type: [llm_javelin, cnn_javelin, other]
name: "${{ matrix.model-type }} ${{ matrix.runner-info.arch }}"
name: ${{ matrix.test-group.name }}
env:
TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
ARCH_NAME: ${{ matrix.runner-info.arch }}
ARCH_NAME: ${{ matrix.test-group.arch }}
LOGURU_LEVEL: INFO
TTNN_CONFIG_OVERRIDES: '{"enable_fast_runtime_mode": true}'
LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
environment: dev
runs-on: ${{ matrix.runner-info.runs-on }}
runs-on: ${{ matrix.test-group.runs-on }}
steps:
- uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0
- name: Enable Performance mode
- name: Enable performance mode
run: |
sudo cpupower frequency-set -g performance
- name: Ensure weka mount is active
Expand All @@ -50,15 +56,17 @@ jobs:
echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
name: TTMetal_build_${{ matrix.runner-info.arch }}
name: TTMetal_build_${{ matrix.test-group.arch }}
- name: Extract files
run: tar -xvf ttm_${{ matrix.runner-info.arch }}.tar
run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
- uses: ./.github/actions/install-python-deps
- name: Run performance regressions
- name: Run model perf regression tests
timeout-minutes: 60
run: |
source python_env/bin/activate
./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type ${{ matrix.model-type }}_models_performance_${{ matrix.runner-info.machine-type }}_multi_device
source ${{ github.workspace }}/python_env/bin/activate
cd $TT_METAL_HOME
export PYTHONPATH=$TT_METAL_HOME
${{ matrix.test-group.cmd }}
- name: Check perf report exists
id: check-perf-report
if: ${{ !cancelled() }}
Expand All @@ -71,9 +79,9 @@ jobs:
if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }}
uses: actions/upload-artifact@v4
with:
name: perf-report-csv-${{ matrix.model-type }}-${{ matrix.runner-info.arch }}-${{ matrix.runner-info.machine-type }}
name: perf-report-csv-${{ matrix.test-group.model-type }}-${{ matrix.test-group.arch }}-${{ matrix.test-group.machine-type }}
path: "${{ steps.check-perf-report.outputs.perf_report_filename }}"
- name: Disable Performance mode
- name: Disable performance mode
if: always()
run: |
sudo cpupower frequency-set -g ondemand
41 changes: 41 additions & 0 deletions .github/workflows/t3000-profiler-tests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
name: "[T3K] T3000 profiler tests"

on:
workflow_dispatch:
workflow_call:
schedule:
- cron: "0 */8 * * *" # This cron schedule runs the workflow every 8 hours

jobs:
t3000-profiler-tests:
strategy:
fail-fast: false
matrix:
test-group: [
{
name: "T3000 profiler tests",
arch: wormhole_b0,
runs-on: [arch-wormhole_b0, "config-t3000", "in-service", "runner-test", "bare-metal", "pipeline-perf"],
cmd: './tests/scripts/run_profiler_regressions.sh'
},
]
name: ${{ matrix.test-group.name }}
env:
TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
ARCH_NAME: ${{ matrix.test-group.arch }}
LOGURU_LEVEL: INFO
LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
environment: dev
runs-on: ${{ matrix.test-group.runs-on }}
steps:
- uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0
- name: Set up dynamic env vars for build
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- name: Build tt-metal and libs
run: |
./scripts/build_scripts/build_with_profiler_opt.sh
- name: Run profiler regression tests
timeout-minutes: 30
run: |
./tests/scripts/run_profiler_regressions.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: "Multi-chip unit tests"
name: "[T3K] T3000 unit tests"

on:
workflow_dispatch:
Expand All @@ -11,33 +11,25 @@ jobs:
with:
arch: '["wormhole_b0"]'
secrets: inherit

multi-chip-unit-tests:
t3000-unit-tests:
needs: build-artifact
strategy:
# Do not fail-fast because we need to ensure all tests go to completion
# so we try not to get hanging machines
fail-fast: false
matrix:
test-group: [
# N300 2x4
{
name: "T3000 unit tests",
arch: wormhole_b0,
runs-on: ["wormhole_b0", "multi-chip-num-pcie-4", "multi-chip-num-chips-8"],
cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type post_commit_multi_device --dispatch-mode ""'
runs-on: [arch-wormhole_b0, "config-t3000", "in-service", "runner-test", "bare-metal", "pipeline-functional"],
cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type unit_t3000_device --dispatch-mode ""'
},
# {
# name: "T3000 unstable tests",
# arch: wormhole_b0,
# runs-on: ["wormhole_b0", "multi-chip-num-pcie-4", "multi-chip-num-chips-8"],
# cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type post_commit_multi_device_unstable --dispatch-mode ""'
# },
]
name: ${{ matrix.test-group.name }} ${{ matrix.test-group.arch }}
name: ${{ matrix.test-group.name }}
env:
TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
ARCH_NAME: ${{ matrix.test-group.arch }}
LOGURU_LEVEL: INFO
LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
environment: dev
runs-on: ${{ matrix.test-group.runs-on }}
steps:
Expand All @@ -51,9 +43,10 @@ jobs:
- name: Extract files
run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
- uses: ./.github/actions/install-python-deps
- name: Run pre/post regression tests
- name: Run unit regression tests
timeout-minutes: 120
run: |
source ${{ github.workspace }}/python_env/bin/activate
cd $TT_METAL_HOME
export PYTHONPATH=$TT_METAL_HOME
${{ matrix.test-group.cmd }}
32 changes: 16 additions & 16 deletions .github/workflows/tg-unit-tests.yaml
Original file line number Diff line number Diff line change
@@ -1,49 +1,49 @@
name: "[TG] TG unit tests"

on:
push:
branches:
- galaxy/main
schedule:
- cron: '0 0 * * *' # Runs every day at 12am UTC
workflow_dispatch:

jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
with:
arch: '["wormhole_b0"]'
secrets: inherit
TG-tests:
needs: build-artifact
strategy:
# Do not fail-fast because we need to ensure all tests go to completion
# so we try not to get hanging machines
fail-fast: false
matrix:
runner-info: [
# TG
{arch: wormhole_b0, runs-on: ["config-tg", "in-service"]},
]
test-group: [
{name: "TG Unit Tests", cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type post_commit_tg --dispatch-mode ""'},
{
name: "TG unit tests",
arch: wormhole_b0,
runs-on: [arch-wormhole_b0, "config-tg", "in-service", "runner-test", "bare-metal", "pipeline-functional"],
cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type unit_tg_device --dispatch-mode ""'
},
]
name: ${{ matrix.test-group.name }} ${{ matrix.runner-info.arch }} ${{ matrix.runner-info.name }}
name: ${{ matrix.test-group.name }}
env:
TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
ARCH_NAME: ${{ matrix.runner-info.arch }}
ARCH_NAME: ${{ matrix.test-group.arch }}
LOGURU_LEVEL: INFO
LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
runs-on: ${{ matrix.runner-info.runs-on }}
environment: dev
runs-on: ${{ matrix.test-group.runs-on }}
steps:
- uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0
- name: Set up dynamic env vars for build
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
name: TTMetal_build_${{ matrix.runner-info.arch }}
name: TTMetal_build_${{ matrix.test-group.arch }}
- name: Extract files
run: tar -xvf ttm_${{ matrix.runner-info.arch }}.tar
run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
- uses: ./.github/actions/install-python-deps
- name: ${{ matrix.test-group.name }} tests
- name: Run unit regression tests
timeout-minutes: 45
run: |
source ${{ github.workspace }}/python_env/bin/activate
Expand Down
Loading

0 comments on commit 607faa1

Please sign in to comment.