From 607faa1b2240281136f2eb719a3ae347fabf6e41 Mon Sep 17 00:00:00 2001 From: Tapasvi Patel Date: Thu, 16 May 2024 19:13:33 +0000 Subject: [PATCH] #8558: Refactor t3000, tg and tgg pipelines, workflows and run test scripts --- .../workflows/run-profiler-regression.yaml | 2 - ...o-end-demos.yaml => t3000-demo-tests.yaml} | 24 ++- ...requent.yaml => t3000-frequent-tests.yaml} | 30 ++-- ...odels.yaml => t3000-model-perf-tests.yaml} | 54 ++++--- .github/workflows/t3000-profiler-tests.yaml | 41 +++++ ...-unit-tests.yaml => t3000-unit-tests.yaml} | 25 ++-- .github/workflows/tg-unit-tests.yaml | 32 ++-- .github/workflows/tgg-unit-tests.yaml | 28 ++-- CODEOWNERS | 9 +- .../demos/falcon7b/tests/test_perf_falcon.py | 2 +- .../mixtral8x7b/tests/test_mixtral_perf.py | 2 +- pytest.ini | 2 +- .../multi_chip/run_end_to_end_demos.sh | 23 --- .../run_frequent_regressions_multi_device.sh | 35 ----- ...re_post_commit_regressions_multi_device.sh | 49 ------ .../multi_chip/run_unstable_multi_device.sh | 13 -- tests/scripts/run_performance.sh | 29 ---- tests/scripts/run_tests.sh | 141 ++++++++++-------- tests/scripts/t3000/run_t3000_demo_tests.sh | 46 ++++++ .../scripts/t3000/run_t3000_frequent_tests.sh | 121 +++++++++++++++ .../t3000/run_t3000_model_perf_tests.sh | 111 ++++++++++++++ tests/scripts/t3000/run_t3000_unit_tests.sh | 127 ++++++++++++++++ .../tg/run_pre_post_commit_regressions_tg.sh | 17 --- tests/scripts/tg/run_tg_unit_tests.sh | 28 ++++ .../run_pre_post_commit_regressions_tgg.sh | 17 --- tests/scripts/tgg/run_tgg_unit_tests.sh | 28 ++++ 26 files changed, 687 insertions(+), 349 deletions(-) rename .github/workflows/{multi-device-end-to-end-demos.yaml => t3000-demo-tests.yaml} (65%) rename .github/workflows/{multi-device-build-and-unit-tests-frequent.yaml => t3000-frequent-tests.yaml} (55%) rename .github/workflows/{multi-device-perf-models.yaml => t3000-model-perf-tests.yaml} (53%) create mode 100644 .github/workflows/t3000-profiler-tests.yaml rename 
.github/workflows/{multi-device-build-and-unit-tests.yaml => t3000-unit-tests.yaml} (62%) delete mode 100755 tests/scripts/multi_chip/run_end_to_end_demos.sh delete mode 100755 tests/scripts/multi_chip/run_frequent_regressions_multi_device.sh delete mode 100755 tests/scripts/multi_chip/run_pre_post_commit_regressions_multi_device.sh delete mode 100644 tests/scripts/multi_chip/run_unstable_multi_device.sh create mode 100755 tests/scripts/t3000/run_t3000_demo_tests.sh create mode 100755 tests/scripts/t3000/run_t3000_frequent_tests.sh create mode 100755 tests/scripts/t3000/run_t3000_model_perf_tests.sh create mode 100755 tests/scripts/t3000/run_t3000_unit_tests.sh delete mode 100755 tests/scripts/tg/run_pre_post_commit_regressions_tg.sh create mode 100755 tests/scripts/tg/run_tg_unit_tests.sh delete mode 100755 tests/scripts/tgg/run_pre_post_commit_regressions_tgg.sh create mode 100755 tests/scripts/tgg/run_tgg_unit_tests.sh diff --git a/.github/workflows/run-profiler-regression.yaml b/.github/workflows/run-profiler-regression.yaml index f975e54429e..fbedb2b0d1f 100644 --- a/.github/workflows/run-profiler-regression.yaml +++ b/.github/workflows/run-profiler-regression.yaml @@ -18,8 +18,6 @@ jobs: {arch: wormhole_b0, runs-on: ["wormhole_b0", "multi-chip-num-pcie-1", "multi-chip-num-chips-1"]}, # N300 {arch: wormhole_b0, runs-on: ["wormhole_b0", "multi-chip-num-pcie-1", "multi-chip-num-chips-2"]}, - # N300 2x4 - {name: "n300-2x4", arch: wormhole_b0, runs-on: ["wormhole_b0", "multi-chip-num-pcie-4", "multi-chip-num-chips-8"]}, ] env: TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} diff --git a/.github/workflows/multi-device-end-to-end-demos.yaml b/.github/workflows/t3000-demo-tests.yaml similarity index 65% rename from .github/workflows/multi-device-end-to-end-demos.yaml rename to .github/workflows/t3000-demo-tests.yaml index 11be7baeb77..393b7859ecc 100644 --- a/.github/workflows/multi-device-end-to-end-demos.yaml +++ b/.github/workflows/t3000-demo-tests.yaml @@ -1,9 +1,9 @@ 
-name: "Multi-chip demo tests" +name: "[T3K] T3000 demo tests" on: workflow_dispatch: schedule: - - cron: '0 0 * * *' # This cron schedule runs the workflow every day at 12am UTC + - cron: '0 0 * * 6' # This cron schedule runs the workflow every Saturday at 12am UTC jobs: build-artifact: @@ -11,28 +11,25 @@ jobs: with: arch: '["wormhole_b0"]' secrets: inherit - - multi-chip-unit-tests: + t3000-demo-tests: needs: build-artifact strategy: - # Do not fail-fast because we need to ensure all tests go to completion - # so we try not to get hanging machines fail-fast: false matrix: test-group: [ - # N300 2x4 { - name: "T3000 end to end demo tests", + name: "T3000 demo tests", arch: wormhole_b0, - runs-on: ["perf-t3000", "arch-wormhole_b0", "multi-chip-num-pcie-4", "multi-chip-num-chips-8"], - machine-type: "bare_metal", - cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type end_to_end_demos_multi_device --dispatch-mode ""' + runs-on: [arch-wormhole_b0, "config-t3000", "in-service", "runner-test", "bare-metal", "pipeline-functional"], + cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type demos_t3000_device --dispatch-mode ""' }, ] - name: ${{ matrix.test-group.name }} ${{ matrix.test-group.arch }} + name: ${{ matrix.test-group.name }} env: TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ matrix.test-group.arch }} + LOGURU_LEVEL: INFO + LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib environment: dev runs-on: ${{ matrix.test-group.runs-on }} steps: @@ -46,9 +43,10 @@ jobs: - name: Extract files run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar - uses: ./.github/actions/install-python-deps - - name: Run pre/post regression tests + - name: Run demo regression tests timeout-minutes: 180 run: | source ${{ github.workspace }}/python_env/bin/activate + cd $TT_METAL_HOME export PYTHONPATH=$TT_METAL_HOME ${{ matrix.test-group.cmd }} diff --git a/.github/workflows/multi-device-build-and-unit-tests-frequent.yaml 
b/.github/workflows/t3000-frequent-tests.yaml similarity index 55% rename from .github/workflows/multi-device-build-and-unit-tests-frequent.yaml rename to .github/workflows/t3000-frequent-tests.yaml index b108d0f2f67..f7b89b65988 100644 --- a/.github/workflows/multi-device-build-and-unit-tests-frequent.yaml +++ b/.github/workflows/t3000-frequent-tests.yaml @@ -1,4 +1,4 @@ -name: "Nightly multi-chip tests" +name: "[T3K] T3000 frequent tests" on: workflow_dispatch: @@ -11,23 +11,27 @@ jobs: with: arch: '["wormhole_b0"]' secrets: inherit - - multi-chip-nightly: + t3000-frequent-tests: needs: build-artifact strategy: - # Do not fail-fast because we need to ensure all tests go to completion - # so we try not to get hanging machines fail-fast: false matrix: - runner-info: [ - # N300 2x4 - {name: "n300-2x4", arch: wormhole_b0, runs-on: ["wormhole_b0", "multi-chip-num-pcie-4", "multi-chip-num-chips-8"]}, + test-group: [ + { + name: "T3000 frequent tests", + arch: wormhole_b0, + runs-on: [arch-wormhole_b0, "config-t3000", "in-service", "runner-test", "bare-metal", "pipeline-functional"], + cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_t3000_device --dispatch-mode ""' + }, ] + name: ${{ matrix.test-group.name }} env: TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} - ARCH_NAME: ${{ matrix.runner-info.arch }} + ARCH_NAME: ${{ matrix.test-group.arch }} + LOGURU_LEVEL: INFO + LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib environment: dev - runs-on: ${{ matrix.runner-info.runs-on }} + runs-on: ${{ matrix.test-group.runs-on }} steps: - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 - name: Set up dynamic env vars for build @@ -35,9 +39,9 @@ jobs: echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV - uses: actions/download-artifact@v4 with: - name: TTMetal_build_${{ matrix.runner-info.arch }} + name: TTMetal_build_${{ matrix.test-group.arch }} - name: Extract files - run: tar -xvf ttm_${{ matrix.runner-info.arch 
}}.tar + run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar - uses: ./.github/actions/install-python-deps - name: Run frequent regression tests timeout-minutes: 60 @@ -45,4 +49,4 @@ jobs: source ${{ github.workspace }}/python_env/bin/activate cd $TT_METAL_HOME export PYTHONPATH=$TT_METAL_HOME - ./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type frequent_multi_device --dispatch-mode "" + ${{ matrix.test-group.cmd }} diff --git a/.github/workflows/multi-device-perf-models.yaml b/.github/workflows/t3000-model-perf-tests.yaml similarity index 53% rename from .github/workflows/multi-device-perf-models.yaml rename to .github/workflows/t3000-model-perf-tests.yaml index 4a43bf1e438..683158cbc62 100644 --- a/.github/workflows/multi-device-perf-models.yaml +++ b/.github/workflows/t3000-model-perf-tests.yaml @@ -1,4 +1,4 @@ -name: "Multi-Nebula model perf regressions and output report" +name: "[T3K] T3000 model perf tests" on: workflow_dispatch: @@ -11,32 +11,38 @@ jobs: with: arch: '["wormhole_b0"]' secrets: inherit - - multi-device-models-perf: + t3000-model-perf-tests: needs: build-artifact strategy: - # Do not fail-fast because we need to ensure all tests go to completion - # so we try not to get hanging machines fail-fast: false matrix: - runner-info: [ - # N300 2x4 - # NOTE: Never use arch-wormhole_b0 tags, however we're using it here because this machine is used by devs during the day - # We don't want other CI runs to interrupt dev flows. 
However, we need to fix this once we have more 2x4 machines dedicated to CI - {name: "n300-2x4", arch: wormhole_b0, runs-on: ["perf-t3000", "arch-wormhole_b0", "multi-chip-num-pcie-4", "multi-chip-num-chips-8"], machine-type: "bare_metal"}, + test-group: [ + { + name: "T3000 LLM model perf tests", + model-type: "LLM", + arch: wormhole_b0, + runs-on: [arch-wormhole_b0, "config-t3000", "in-service", "runner-test", "bare-metal", "pipeline-perf"], + cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type llm_model_perf_t3000_device --dispatch-mode ""' + }, + { + name: "T3000 CNN model perf tests", + model-type: "CNN", + arch: wormhole_b0, + runs-on: [arch-wormhole_b0, "config-t3000", "in-service", "runner-test", "bare-metal", "pipeline-perf"], + cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type cnn_model_perf_t3000_device --dispatch-mode ""' + }, ] - model-type: [llm_javelin, cnn_javelin, other] - name: "${{ matrix.model-type }} ${{ matrix.runner-info.arch }}" + name: ${{ matrix.test-group.name }} env: TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} - ARCH_NAME: ${{ matrix.runner-info.arch }} + ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO - TTNN_CONFIG_OVERRIDES: '{"enable_fast_runtime_mode": true}' + LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib environment: dev - runs-on: ${{ matrix.runner-info.runs-on }} + runs-on: ${{ matrix.test-group.runs-on }} steps: - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 - - name: Enable Performance mode + - name: Enable performance mode run: | sudo cpupower frequency-set -g performance - name: Ensure weka mount is active @@ -50,15 +56,17 @@ jobs: echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV - uses: actions/download-artifact@v4 with: - name: TTMetal_build_${{ matrix.runner-info.arch }} + name: TTMetal_build_${{ matrix.test-group.arch }} - name: Extract files - run: tar -xvf ttm_${{ matrix.runner-info.arch }}.tar + run: tar -xvf ttm_${{ 
matrix.test-group.arch }}.tar - uses: ./.github/actions/install-python-deps - - name: Run performance regressions + - name: Run model perf regression tests timeout-minutes: 60 run: | - source python_env/bin/activate - ./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type ${{ matrix.model-type }}_models_performance_${{ matrix.runner-info.machine-type }}_multi_device + source ${{ github.workspace }}/python_env/bin/activate + cd $TT_METAL_HOME + export PYTHONPATH=$TT_METAL_HOME + ${{ matrix.test-group.cmd }} - name: Check perf report exists id: check-perf-report if: ${{ !cancelled() }} @@ -71,9 +79,9 @@ jobs: if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }} uses: actions/upload-artifact@v4 with: - name: perf-report-csv-${{ matrix.model-type }}-${{ matrix.runner-info.arch }}-${{ matrix.runner-info.machine-type }} + name: perf-report-csv-${{ matrix.test-group.model-type }}-${{ matrix.test-group.arch }}-${{ matrix.test-group.machine-type }} path: "${{ steps.check-perf-report.outputs.perf_report_filename }}" - - name: Disable Performance mode + - name: Disable performance mode if: always() run: | sudo cpupower frequency-set -g ondemand diff --git a/.github/workflows/t3000-profiler-tests.yaml b/.github/workflows/t3000-profiler-tests.yaml new file mode 100644 index 00000000000..99942f93314 --- /dev/null +++ b/.github/workflows/t3000-profiler-tests.yaml @@ -0,0 +1,41 @@ +name: "[T3K] T3000 profiler tests" + +on: + workflow_dispatch: + workflow_call: + schedule: + - cron: "0 */8 * * *" # This cron schedule runs the workflow every 8 hours + +jobs: + t3000-profiler-tests: + strategy: + fail-fast: false + matrix: + test-group: [ + { + name: "T3000 profiler tests", + arch: wormhole_b0, + runs-on: [arch-wormhole_b0, "config-t3000", "in-service", "runner-test", "bare-metal", "pipeline-perf"], + cmd: './tests/scripts/run_profiler_regressions.sh' + }, + ] + name: ${{ matrix.test-group.name }} + env: + TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} + 
ARCH_NAME: ${{ matrix.test-group.arch }} + LOGURU_LEVEL: INFO + LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib + environment: dev + runs-on: ${{ matrix.test-group.runs-on }} + steps: + - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 + - name: Set up dynamic env vars for build + run: | + echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV + - name: Build tt-metal and libs + run: | + ./scripts/build_scripts/build_with_profiler_opt.sh + - name: Run profiler regression tests + timeout-minutes: 30 + run: | + ./tests/scripts/run_profiler_regressions.sh diff --git a/.github/workflows/multi-device-build-and-unit-tests.yaml b/.github/workflows/t3000-unit-tests.yaml similarity index 62% rename from .github/workflows/multi-device-build-and-unit-tests.yaml rename to .github/workflows/t3000-unit-tests.yaml index 6c346077d98..935f4c93750 100644 --- a/.github/workflows/multi-device-build-and-unit-tests.yaml +++ b/.github/workflows/t3000-unit-tests.yaml @@ -1,4 +1,4 @@ -name: "Multi-chip unit tests" +name: "[T3K] T3000 unit tests" on: workflow_dispatch: @@ -11,33 +11,25 @@ jobs: with: arch: '["wormhole_b0"]' secrets: inherit - - multi-chip-unit-tests: + t3000-unit-tests: needs: build-artifact strategy: - # Do not fail-fast because we need to ensure all tests go to completion - # so we try not to get hanging machines fail-fast: false matrix: test-group: [ - # N300 2x4 { name: "T3000 unit tests", arch: wormhole_b0, - runs-on: ["wormhole_b0", "multi-chip-num-pcie-4", "multi-chip-num-chips-8"], - cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type post_commit_multi_device --dispatch-mode ""' + runs-on: [arch-wormhole_b0, "config-t3000", "in-service", "runner-test", "bare-metal", "pipeline-functional"], + cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type unit_t3000_device --dispatch-mode ""' }, - # { - # name: "T3000 unstable tests", - # arch: wormhole_b0, - # runs-on: ["wormhole_b0", 
"multi-chip-num-pcie-4", "multi-chip-num-chips-8"], - # cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type post_commit_multi_device_unstable --dispatch-mode ""' - # }, ] - name: ${{ matrix.test-group.name }} ${{ matrix.test-group.arch }} + name: ${{ matrix.test-group.name }} env: TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} ARCH_NAME: ${{ matrix.test-group.arch }} + LOGURU_LEVEL: INFO + LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib environment: dev runs-on: ${{ matrix.test-group.runs-on }} steps: @@ -51,9 +43,10 @@ jobs: - name: Extract files run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar - uses: ./.github/actions/install-python-deps - - name: Run pre/post regression tests + - name: Run unit regression tests timeout-minutes: 120 run: | source ${{ github.workspace }}/python_env/bin/activate + cd $TT_METAL_HOME export PYTHONPATH=$TT_METAL_HOME ${{ matrix.test-group.cmd }} diff --git a/.github/workflows/tg-unit-tests.yaml b/.github/workflows/tg-unit-tests.yaml index 89dcec33fd9..12163a65d0d 100644 --- a/.github/workflows/tg-unit-tests.yaml +++ b/.github/workflows/tg-unit-tests.yaml @@ -1,9 +1,6 @@ name: "[TG] TG unit tests" on: - push: - branches: - - galaxy/main schedule: - cron: '0 0 * * *' # Runs every day at 12am UTC workflow_dispatch: @@ -11,27 +8,30 @@ on: jobs: build-artifact: uses: ./.github/workflows/build-artifact.yaml + with: + arch: '["wormhole_b0"]' secrets: inherit TG-tests: + needs: build-artifact strategy: - # Do not fail-fast because we need to ensure all tests go to completion - # so we try not to get hanging machines fail-fast: false matrix: - runner-info: [ - # TG - {arch: wormhole_b0, runs-on: ["config-tg", "in-service"]}, - ] test-group: [ - {name: "TG Unit Tests", cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type post_commit_tg --dispatch-mode ""'}, + { + name: "TG unit tests", + arch: wormhole_b0, + runs-on: [arch-wormhole_b0, "config-tg", "in-service", "runner-test", "bare-metal", 
"pipeline-functional"], + cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type unit_tg_device --dispatch-mode ""' + }, ] - name: ${{ matrix.test-group.name }} ${{ matrix.runner-info.arch }} ${{ matrix.runner-info.name }} + name: ${{ matrix.test-group.name }} env: TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} - ARCH_NAME: ${{ matrix.runner-info.arch }} + ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib - runs-on: ${{ matrix.runner-info.runs-on }} + environment: dev + runs-on: ${{ matrix.test-group.runs-on }} steps: - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 - name: Set up dynamic env vars for build @@ -39,11 +39,11 @@ jobs: echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV - uses: actions/download-artifact@v4 with: - name: TTMetal_build_${{ matrix.runner-info.arch }} + name: TTMetal_build_${{ matrix.test-group.arch }} - name: Extract files - run: tar -xvf ttm_${{ matrix.runner-info.arch }}.tar + run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar - uses: ./.github/actions/install-python-deps - - name: ${{ matrix.test-group.name }} tests + - name: Run unit regression tests timeout-minutes: 45 run: | source ${{ github.workspace }}/python_env/bin/activate diff --git a/.github/workflows/tgg-unit-tests.yaml b/.github/workflows/tgg-unit-tests.yaml index eb3d6c23d18..5351b9ae824 100644 --- a/.github/workflows/tgg-unit-tests.yaml +++ b/.github/workflows/tgg-unit-tests.yaml @@ -8,27 +8,29 @@ on: jobs: build-artifact: uses: ./.github/workflows/build-artifact.yaml + with: + arch: '["wormhole_b0"]' secrets: inherit TGG-tests: + needs: build-artifact strategy: - # Do not fail-fast because we need to ensure all tests go to completion - # so we try not to get hanging machines fail-fast: false matrix: - runner-info: [ - # TGG - {arch: wormhole_b0, runs-on: ["config-tgg", "in-service"]}, - ] test-group: [ - {name: "TGG Unit Tests", cmd: './tests/scripts/run_tests.sh 
--tt-arch wormhole_b0 --pipeline-type post_commit_tgg --dispatch-mode ""'}, + { + name: "TGG unit tests", + arch: wormhole_b0, + runs-on: [arch-wormhole_b0, "config-tgg", "in-service", "runner-test", "bare-metal", "pipeline-functional"], + cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type unit_tgg_device --dispatch-mode ""' + }, ] - name: ${{ matrix.test-group.name }} ${{ matrix.runner-info.arch }} ${{ matrix.runner-info.name }} + name: ${{ matrix.test-group.name }} env: TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} - ARCH_NAME: ${{ matrix.runner-info.arch }} + ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib - runs-on: ${{ matrix.runner-info.runs-on }} + runs-on: ${{ matrix.test-group.runs-on }} steps: - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 - name: Set up dynamic env vars for build @@ -36,11 +38,11 @@ jobs: echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV - uses: actions/download-artifact@v4 with: - name: TTMetal_build_${{ matrix.runner-info.arch }} + name: TTMetal_build_${{ matrix.test-group.arch }} - name: Extract files - run: tar -xvf ttm_${{ matrix.runner-info.arch }}.tar + run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar - uses: ./.github/actions/install-python-deps - - name: ${{ matrix.test-group.name }} tests + - name: Run unit regression tests timeout-minutes: 45 run: | source ${{ github.workspace }}/python_env/bin/activate diff --git a/CODEOWNERS b/CODEOWNERS index 4864b93c26a..147b0e9a246 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -2,6 +2,13 @@ # precedence. 
.github/ @tt-rkim +.github/t3000-unit-tests.yaml @tapspatel +.github/t3000-profiler-tests.yaml @tapspatel +.github/t3000-model-perf-tests.yaml @tapspatel +.github/t3000-frequent-tests.yaml @tapspatel +.github/t3000-demo-tests.yaml @tapspatel +.github/tg-unit-tests.yaml @tapspatel +.github/tgg-unit-tests.yaml @tapspatel /infra/ @tt-rkim @@ -39,7 +46,7 @@ tests/scripts/run_pre_post_commit_regressions_multi_device.sh @tt-rkim @aliuTT @ tests/scripts/run_pre_post_commit_regressions_fast_dispatch.sh @tt-rkim @TT-billteng @ttmchiou tests/scripts/run_models.sh @tt-rkim tests/scripts/nightly/ @tt-rkim @vtangTT -tests/scripts/multi_chip/ @tapspatel +tests/scripts/t3000/ @tapspatel tests/scripts/tg/ @tapspatel tests/scripts/tgg/ @tapspatel diff --git a/models/demos/falcon7b/tests/test_perf_falcon.py b/models/demos/falcon7b/tests/test_perf_falcon.py index 7e036c3b239..0765ed99412 100644 --- a/models/demos/falcon7b/tests/test_perf_falcon.py +++ b/models/demos/falcon7b/tests/test_perf_falcon.py @@ -584,7 +584,7 @@ def test_perf_wh_bare_metal( async_mode, ) - @pytest.mark.models_performance_bare_metal_multi_device + @pytest.mark.model_perf_t3000 @pytest.mark.parametrize( "llm_mode, num_devices, num_layers, batch, seq_len, kv_cache_len, model_config_str, expected_output_pcc, expected_k_cache_pcc, expected_v_cache_pcc, expected_inference_time, async_mode", ( diff --git a/models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py b/models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py index 2fc1a67e180..77cf593e60a 100644 --- a/models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py +++ b/models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py @@ -36,7 +36,7 @@ def forward(self, x): return self.emb(x) -@pytest.mark.models_performance_bare_metal_multi_device +@pytest.mark.model_perf_t3000 @pytest.mark.parametrize( "generation_start_pos, expected_compile_time, expected_inference_time", ( diff --git a/pytest.ini b/pytest.ini index 10ee11deb4c..b1f2bdbc22c 100644 --- 
a/pytest.ini +++ b/pytest.ini @@ -13,4 +13,4 @@ markers = models_performance_bare_metal: mark model silicon tests for performance on bare metal models_performance_virtual_machine: mark model silicon tests for performance on virtual_machine models_device_performance_bare_metal: mark model silicon tests for device performance on bare metal - models_performance_bare_metal_multi_device: mark model silicon tests for performance on multi-chip bare metal + model_perf_t3000: mark model silicon tests for performance on t3000 bare metal diff --git a/tests/scripts/multi_chip/run_end_to_end_demos.sh b/tests/scripts/multi_chip/run_end_to_end_demos.sh deleted file mode 100755 index f15ee8d9256..00000000000 --- a/tests/scripts/multi_chip/run_end_to_end_demos.sh +++ /dev/null @@ -1,23 +0,0 @@ - -#/bin/bash - -set -eo pipefail - -if [[ -z "$TT_METAL_HOME" ]]; then - echo "Must provide TT_METAL_HOME in environment" 1>&2 - exit 1 -fi - -if [[ -z "$ARCH_NAME" ]]; then - echo "Must provide ARCH_NAME in environment" 1>&2 - exit 1 -fi - -cd $TT_METAL_HOME -export PYTHONPATH=$TT_METAL_HOME - -# Falcon40B prefill 60 layer end to end with 10 loops; we need 8x8 grid size -WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_60_layer_t3000_prefill_10_loops.py - -# Falcon40B end to end demo (prefill + decode) -WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_t3000_demo_5_loops.py diff --git a/tests/scripts/multi_chip/run_frequent_regressions_multi_device.sh b/tests/scripts/multi_chip/run_frequent_regressions_multi_device.sh deleted file mode 100755 index ac7e4fd1f12..00000000000 --- a/tests/scripts/multi_chip/run_frequent_regressions_multi_device.sh +++ /dev/null @@ -1,35 +0,0 @@ - -#/bin/bash - -set -eo pipefail - -if [[ -z "$TT_METAL_HOME" ]]; then - echo "Must provide TT_METAL_HOME in environment" 1>&2 - exit 1 -fi - -if [[ -z "$ARCH_NAME" ]]; then - echo 
"Must provide ARCH_NAME in environment" 1>&2 - exit 1 -fi - -cd $TT_METAL_HOME -export PYTHONPATH=$TT_METAL_HOME - -pytest tests/ttnn/unit_tests/test_multi_device.py - -pytest tests/tt_metal/microbenchmarks/ethernet/test_ethernet_bidirectional_bandwidth_microbenchmark.py -pytest tests/tt_metal/microbenchmarks/ethernet/test_ethernet_ring_latency_microbenchmark.py - -# Llama2_70b related cached files and tests (the test should parse env variables similar to these) -export LLAMA_CKPT_DIR=/mnt/MLPerf/tt_dnn-models/llama-2/llama-2-70b-repacked/ -export LLAMA_TOKENIZER_PATH=/mnt/MLPerf/tt_dnn-models/llama-2/tokenizer.model -export LLAMA_CACHE_PATH=/mnt/MLPerf/tt_dnn-models/llama-2/llama-data-cache/weights-cache-2 - -pytest models/demos/t3000/llama2_70b/tests/test_llama_mlp_t3000.py -pytest models/demos/t3000/llama2_70b/tests/test_llama_attention_t3000.py -pytest models/demos/t3000/llama2_70b/tests/test_llama_decoder_t3000.py -pytest models/demos/t3000/llama2_70b/tests/test_llama_model_t3000.py - -# Mistral8x7b 8 chip decode model test (env flags set inside the test) -pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_model.py::test_mixtral_model_inference[10-1-pcc] diff --git a/tests/scripts/multi_chip/run_pre_post_commit_regressions_multi_device.sh b/tests/scripts/multi_chip/run_pre_post_commit_regressions_multi_device.sh deleted file mode 100755 index a2081e36d58..00000000000 --- a/tests/scripts/multi_chip/run_pre_post_commit_regressions_multi_device.sh +++ /dev/null @@ -1,49 +0,0 @@ - -#/bin/bash - -set -eo pipefail - -if [[ -z "$TT_METAL_HOME" ]]; then - echo "Must provide TT_METAL_HOME in environment" 1>&2 - exit 1 -fi - -if [[ -z "$ARCH_NAME" ]]; then - echo "Must provide ARCH_NAME in environment" 1>&2 - exit 1 -fi - -TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectSendAllConnectedChips" -TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests 
--gtest_filter="DeviceFixture.EthKernelsSendInterleavedBufferAllConnectedChips" -TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectRingGatherAllChips" -TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsInterleavedRingGatherAllChips" - -TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueSingleCardFixture.*" -./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueMultiDeviceFixture.*" -./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="DPrintFixture.*:WatcherFixture.*" -pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py -k post_commit - -# ttnn multi-chip apis unit tests -pytest tests/ttnn/unit_tests/test_multi_device.py - -# Falcon40b unit tests; prefill required 8x8 grids -WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_mlp.py -WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_attention.py -WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_decoder.py -WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_causallm.py -WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_1_layer_t3000.py - -# Mistral8x7b 8 chip decode tests (env flags set inside the tests) -pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_attention.py -pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_mlp.py -pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_rms_norm.py -pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_embedding.py -pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_moe.py -pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_decoder.py -pytest 
models/demos/t3000/mixtral8x7b/tests/test_mixtral_model.py::test_mixtral_model_inference[1-1-pcc] - -# Falcon7B data parallel tests -pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_mlp.py -pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_attention.py -pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_decoder.py -pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_causallm.py diff --git a/tests/scripts/multi_chip/run_unstable_multi_device.sh b/tests/scripts/multi_chip/run_unstable_multi_device.sh deleted file mode 100644 index bd08c570b9b..00000000000 --- a/tests/scripts/multi_chip/run_unstable_multi_device.sh +++ /dev/null @@ -1,13 +0,0 @@ -#/bin/bash - -set -eo pipefail - -if [[ -z "$TT_METAL_HOME" ]]; then - echo "Must provide TT_METAL_HOME in environment" 1>&2 - exit 1 -fi - -if [[ -z "$ARCH_NAME" ]]; then - echo "Must provide ARCH_NAME in environment" 1>&2 - exit 1 -fi diff --git a/tests/scripts/run_performance.sh b/tests/scripts/run_performance.sh index 92b6e086ded..0b0a0692c96 100755 --- a/tests/scripts/run_performance.sh +++ b/tests/scripts/run_performance.sh @@ -43,19 +43,6 @@ run_perf_models_llm_javelin() { env python models/perf/merge_perf_results.py } -run_perf_models_llm_javelin_multi_device() { - local tt_arch=$1 - local test_marker=$2 - - env pytest models/demos/falcon7b/tests -m $test_marker - - # Mistral8x7b env flags are set inside the tests - env pytest models/demos/t3000/mixtral8x7b/tests -m $test_marker - - ## Merge all the generated reports - env python models/perf/merge_perf_results.py -} - run_perf_models_cnn_javelin() { local tt_arch=$1 local test_marker=$2 @@ -70,16 +57,6 @@ run_perf_models_cnn_javelin() { env python models/perf/merge_perf_results.py } -run_perf_models_cnn_javelin_multi_device() { - local tt_arch=$1 - local test_marker=$2 - - # Add tests here - - ## Merge all the generated reports - env python models/perf/merge_perf_results.py -} - run_device_perf_models() { local 
test_marker=$1 @@ -153,8 +130,6 @@ main() { test_marker="models_performance_virtual_machine" elif [[ "$pipeline_type" == *"device_performance_bare_metal"* ]]; then test_marker="models_device_performance_bare_metal" - elif [[ "$pipeline_type" == *"_bare_metal_multi_device"* ]]; then - test_marker="models_performance_bare_metal_multi_device" elif [[ "$pipeline_type" == *"_bare_metal"* ]]; then test_marker="models_performance_bare_metal" else @@ -165,12 +140,8 @@ main() { if [[ "$pipeline_type" == *"device_performance"* ]]; then run_device_perf_models "$test_marker" run_device_perf_ops "$test_marker" - elif [[ "$pipeline_type" == "llm_javelin_models_performance_bare_metal_multi_device" ]]; then - run_perf_models_llm_javelin_multi_device "$tt_arch" "$test_marker" elif [[ "$pipeline_type" == "llm_javelin_models_performance"* ]]; then run_perf_models_llm_javelin "$tt_arch" "$test_marker" - elif [[ "$pipeline_type" == "cnn_javelin_models_performance_bare_metal_multi_device" ]]; then - run_perf_models_cnn_javelin_multi_device "$tt_arch" "$test_marker" elif [[ "$pipeline_type" == "cnn_javelin_models_performance"* ]]; then run_perf_models_cnn_javelin "$tt_arch" "$test_marker" elif [[ "$pipeline_type" == *"other_models_performance"* ]]; then diff --git a/tests/scripts/run_tests.sh b/tests/scripts/run_tests.sh index 818cf3d6327..37580d88309 100755 --- a/tests/scripts/run_tests.sh +++ b/tests/scripts/run_tests.sh @@ -102,42 +102,6 @@ run_frequent_api_pipeline_tests() { fi } -# Run frequent multi device pipeline tests - these are the t3000 + 4xn300 tests -run_frequent_multi_device_pipeline_tests() { - local tt_arch=$1 - local pipeline_type=$2 - local dispatch_mode=$3 - - ./tests/scripts/multi_chip/run_frequent_regressions_multi_device.sh -} - -# Run end to end demos - these are the t3000 + 4xn300 tests -run_end_to_end_demos_multi_device() { - local tt_arch=$1 - local pipeline_type=$2 - local dispatch_mode=$3 - - ./tests/scripts/multi_chip/run_end_to_end_demos.sh -} - -# Run post 
commit TG tests - these are 4xn150 + galaxy tests -run_post_commit_tg_pipeline_tests() { - local tt_arch=$1 - local pipeline_type=$2 - local dispatch_mode=$3 - - ./tests/scripts/tg/run_pre_post_commit_regressions_tg.sh -} - -# Run post commit TGG tests - these are 8xn150 + 2xgalaxy tests -run_post_commit_tgg_pipeline_tests() { - local tt_arch=$1 - local pipeline_type=$2 - local dispatch_mode=$3 - - ./tests/scripts/tgg/run_pre_post_commit_regressions_tgg.sh -} - run_models_performance() { local tt_arch=$1 local pipeline_type=$2 @@ -160,14 +124,6 @@ run_models_performance_bare_metal_pipeline_tests() { run_models_performance "$tt_arch" "$pipeline_type" } -run_models_performance_bare_metal_multi_device_pipeline_tests() { - local tt_arch=$1 - local pipeline_type=$2 - local dispatch_mode=$3 - - run_models_performance_multi_device "$tt_arch" "$pipeline_type" -} - run_models_performance_virtual_machine_pipeline_tests() { local tt_arch=$1 local pipeline_type=$2 @@ -203,14 +159,6 @@ run_stress_post_commit_pipeline_tests() { done } -run_post_commit_multi_device_pipeline_tests() { - local tt_arch=$1 - local pipeline_type=$2 - local dispatch_mode=$3 - - ./tests/scripts/multi_chip/run_pre_post_commit_regressions_multi_device.sh -} - run_post_commit_multi_device_unstable_pipeline_tests() { local tt_arch=$1 local pipeline_type=$2 @@ -239,6 +187,66 @@ run_ttnn_sweeps_pipeline_tests() { ./tests/scripts/run_ttnn_sweeps.sh } +##########################T3000########################## +# Run t3000 unit tests +unit_t3000_device() { + local tt_arch=$1 + local pipeline_type=$2 + local dispatch_mode=$3 + + ./tests/scripts/t3000/run_t3000_unit_tests.sh +} + +# Run t3000 frequent tests +frequent_t3000_device() { + local tt_arch=$1 + local pipeline_type=$2 + local dispatch_mode=$3 + + ./tests/scripts/t3000/run_t3000_frequent_tests.sh +} + +# Run t3000 demo tests +demos_t3000_device() { + local tt_arch=$1 + local pipeline_type=$2 + local dispatch_mode=$3 + + 
./tests/scripts/t3000/run_t3000_demo_tests.sh +} + +# Run t3000 model perf tests +model_perf_t3000_device() { + local tt_arch=$1 + local pipeline_type=$2 + local dispatch_mode=$3 + + ./tests/scripts/t3000/run_t3000_model_perf_tests.sh --pipeline-type "$pipeline_type" +} +##########################T3000########################## + +##########################TG########################## +# Run TG unit tests +unit_tg_device() { + local tt_arch=$1 + local pipeline_type=$2 + local dispatch_mode=$3 + + ./tests/scripts/tg/run_tg_unit_tests.sh +} +##########################TG########################## + +##########################TGG########################## +# Run TGG unit tests +unit_tgg_device() { + local tt_arch=$1 + local pipeline_type=$2 + local dispatch_mode=$3 + + ./tests/scripts/tgg/run_tgg_unit_tests.sh +} +##########################TGG########################## + run_pipeline_tests() { local tt_arch=$1 local pipeline_type=$2 @@ -257,28 +265,29 @@ run_pipeline_tests() { run_eager_package_end_to_end_pipeline_tests "$tt_arch" "$pipeline_type" elif [[ $pipeline_type == *"models_performance_bare_metal" || $pipeline_type == "models_device_performance_bare_metal" ]]; then run_models_performance_bare_metal_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode" - elif [[ $pipeline_type == *"models_performance_bare_metal_multi_device" ]]; then - run_models_performance_bare_metal_multi_device_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode" elif [[ $pipeline_type == "models_performance_virtual_machine" ]]; then run_models_performance_virtual_machine_pipeline_tests "$tt_arch" "$pipeline_type" elif [[ $pipeline_type == "stress_post_commit" ]]; then run_stress_post_commit_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode" - elif [[ $pipeline_type == "post_commit_multi_device" ]]; then - run_post_commit_multi_device_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode" - elif [[ $pipeline_type == "post_commit_multi_device_unstable" ]]; then - 
run_post_commit_multi_device_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode" - elif [[ $pipeline_type == "frequent_multi_device" ]]; then - run_frequent_multi_device_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode" - elif [[ $pipeline_type == "end_to_end_demos_multi_device" ]]; then - run_end_to_end_demos_multi_device "$tt_arch" "$pipeline_type" "$dispatch_mode" - elif [[ $pipeline_type == "post_commit_tg" ]]; then - run_post_commit_tg_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode" - elif [[ $pipeline_type == "post_commit_tgg" ]]; then - run_post_commit_tgg_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode" elif [[ $pipeline_type == "microbenchmarks" ]]; then run_microbenchmarks_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode" elif [[ $pipeline_type == "ttnn_sweeps" ]]; then run_ttnn_sweeps_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode" + # T3000 pipelines + elif [[ $pipeline_type == "unit_t3000_device" ]]; then + unit_t3000_device "$tt_arch" "$pipeline_type" "$dispatch_mode" + elif [[ $pipeline_type == "frequent_t3000_device" ]]; then + frequent_t3000_device "$tt_arch" "$pipeline_type" "$dispatch_mode" + elif [[ $pipeline_type == "demos_t3000_device" ]]; then + demos_t3000_device "$tt_arch" "$pipeline_type" "$dispatch_mode" + elif [[ $pipeline_type == *"model_perf_t3000_device" ]]; then + model_perf_t3000_device "$tt_arch" "$pipeline_type" "$dispatch_mode" + # TG pipelines + elif [[ $pipeline_type == "unit_tg_device" ]]; then + unit_tg_device "$tt_arch" "$pipeline_type" "$dispatch_mode" + # TGG pipelines + elif [[ $pipeline_type == "unit_tgg_device" ]]; then + unit_tgg_device "$tt_arch" "$pipeline_type" "$dispatch_mode" else echo "Unknown pipeline: $pipeline_type" exit 1 diff --git a/tests/scripts/t3000/run_t3000_demo_tests.sh b/tests/scripts/t3000/run_t3000_demo_tests.sh new file mode 100755 index 00000000000..c7b7a2ad24f --- /dev/null +++ b/tests/scripts/t3000/run_t3000_demo_tests.sh @@ -0,0 
+1,46 @@ + +#/bin/bash +set -eo pipefail + +run_t3000_falcon40b_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_falcon40b_tests" + + # Falcon40B prefill 60 layer end to end with 10 loops; we need 8x8 grid size + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_60_layer_t3000_prefill_10_loops.py + + # Falcon40B end to end demo (prefill + decode) + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_t3000_demo_5_loops.py + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_falcon40b_tests $duration seconds to complete" +} + +run_t3000_tests() { + # Run falcon40b tests + run_t3000_falcon40b_tests +} + +main() { + if [[ -z "$TT_METAL_HOME" ]]; then + echo "Must provide TT_METAL_HOME in environment" 1>&2 + exit 1 + fi + + if [[ -z "$ARCH_NAME" ]]; then + echo "Must provide ARCH_NAME in environment" 1>&2 + exit 1 + fi + + # Run all tests + cd $TT_METAL_HOME + export PYTHONPATH=$TT_METAL_HOME + + run_t3000_tests +} + +main "$@" diff --git a/tests/scripts/t3000/run_t3000_frequent_tests.sh b/tests/scripts/t3000/run_t3000_frequent_tests.sh new file mode 100755 index 00000000000..719bda685d9 --- /dev/null +++ b/tests/scripts/t3000/run_t3000_frequent_tests.sh @@ -0,0 +1,121 @@ + +#/bin/bash +set -eo pipefail + +run_t3000_ethernet_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_ethernet_tests" + + pytest tests/tt_metal/microbenchmarks/ethernet/test_ethernet_bidirectional_bandwidth_microbenchmark.py + pytest tests/tt_metal/microbenchmarks/ethernet/test_ethernet_ring_latency_microbenchmark.py + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_ethernet_tests $duration seconds to complete" +} +
+run_t3000_llama2_70b_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_llama2_70b_tests" + + # Llama2_70b related cached files and tests (the test should parse env variables similar to these) + export LLAMA_CKPT_DIR=/mnt/MLPerf/tt_dnn-models/llama-2/llama-2-70b-repacked/ + export LLAMA_TOKENIZER_PATH=/mnt/MLPerf/tt_dnn-models/llama-2/tokenizer.model + export LLAMA_CACHE_PATH=/mnt/MLPerf/tt_dnn-models/llama-2/llama-data-cache/weights-cache-2 + + pytest models/demos/t3000/llama2_70b/tests/test_llama_mlp_t3000.py + pytest models/demos/t3000/llama2_70b/tests/test_llama_attention_t3000.py + pytest models/demos/t3000/llama2_70b/tests/test_llama_decoder_t3000.py + pytest models/demos/t3000/llama2_70b/tests/test_llama_model_t3000.py + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_llama2_70b_tests $duration seconds to complete" +} + +run_t3000_mixtral_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_mixtral_tests" + + # mixtral8x7b 8 chip decode model test (env flags set inside the test) + pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_model.py::test_mixtral_model_inference[10-1-pcc] + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_mixtral_tests $duration seconds to complete" +} + +run_t3000_tteager_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_tteager_tests" + + pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py -k post_commit + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_tteager_tests $duration seconds to complete" +} + +run_t3000_falcon40b_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_falcon40b_tests" + + 
WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_causallm.py + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_1_layer_t3000.py + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_falcon40b_tests $duration seconds to complete" +} + +run_t3000_tests() { + # Run ethernet tests + #run_t3000_ethernet_tests + + # Run tteager tests + #run_t3000_tteager_tests + + # Run llama2-70b tests + run_t3000_llama2_70b_tests + + # Run mixtral tests + run_t3000_mixtral_tests + + # Run falcon40b tests + run_t3000_falcon40b_tests +} + +main() { + if [[ -z "$TT_METAL_HOME" ]]; then + echo "Must provide TT_METAL_HOME in environment" 1>&2 + exit 1 + fi + + if [[ -z "$ARCH_NAME" ]]; then + echo "Must provide ARCH_NAME in environment" 1>&2 + exit 1 + fi + + # Run all tests + cd $TT_METAL_HOME + export PYTHONPATH=$TT_METAL_HOME + + run_t3000_tests +} + +main "$@" diff --git a/tests/scripts/t3000/run_t3000_model_perf_tests.sh b/tests/scripts/t3000/run_t3000_model_perf_tests.sh new file mode 100755 index 00000000000..5e26d9c7de2 --- /dev/null +++ b/tests/scripts/t3000/run_t3000_model_perf_tests.sh @@ -0,0 +1,111 @@ + +#/bin/bash +set -eo pipefail + +run_t3000_falcon7b_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_falcon7b_tests" + + env pytest models/demos/falcon7b/tests -m "model_perf_t3000" + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_falcon7b_tests $duration seconds to complete" +} + +run_t3000_mixtral_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_mixtral_tests" + + env pytest models/demos/t3000/mixtral8x7b/tests -m "model_perf_t3000" + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo
"LOG_METAL: run_t3000_mixtral_tests $duration seconds to complete" +} + +run_t3000_llama2_70b_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_llama2_70b_tests" + + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/experimental/llama2_70b/tests/test_llama_perf_decode.py -m "model_perf_t3000" + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_llama2_70b_tests $duration seconds to complete" +} + +run_t3000_llm_tests() { + # Run falcon7b tests + run_t3000_falcon7b_tests + + # Run mixtral tests + run_t3000_mixtral_tests + + # Run llama2-70b tests + #run_t3000_llama2_70b_tests + + # Merge all the generated reports + env python models/perf/merge_perf_results.py +} + +run_t3000_cnn_tests() { + # Merge all the generated reports + env python models/perf/merge_perf_results.py +} + +main() { + # Parse the arguments + while [[ $# -gt 0 ]]; do + case $1 in + --pipeline-type) + pipeline_type=$2 + shift + ;; + *) + echo "Unknown option: $1" 1>&2 + exit 1 + ;; + esac + shift + done + + if [[ -z "$TT_METAL_HOME" ]]; then + echo "Must provide TT_METAL_HOME in environment" 1>&2 + exit 1 + fi + + if [[ -z "$ARCH_NAME" ]]; then + echo "Must provide ARCH_NAME in environment" 1>&2 + exit 1 + fi + + if [[ -z "$pipeline_type" ]]; then + echo "--pipeline-type cannot be empty" 1>&2 + exit 1 + fi + + # Run all tests + cd $TT_METAL_HOME + export PYTHONPATH=$TT_METAL_HOME + + if [[ "$pipeline_type" == "llm_model_perf_t3000_device" ]]; then + run_t3000_llm_tests + elif [[ "$pipeline_type" == "cnn_model_perf_t3000_device" ]]; then + run_t3000_cnn_tests + else + echo "$pipeline_type is invalid (supported: [llm_model_perf_t3000_device, cnn_model_perf_t3000_device])" 1>&2 + exit 1 + fi +} + +main "$@" diff --git a/tests/scripts/t3000/run_t3000_unit_tests.sh b/tests/scripts/t3000/run_t3000_unit_tests.sh new file mode 100755 index 00000000000..677b9d7cdf1 --- /dev/null +++
b/tests/scripts/t3000/run_t3000_unit_tests.sh @@ -0,0 +1,127 @@ + +#/bin/bash +set -eo pipefail + +run_t3000_ttmetal_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_ttmetal_tests" + + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectSendAllConnectedChips" + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsSendInterleavedBufferAllConnectedChips" + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectRingGatherAllChips" + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsInterleavedRingGatherAllChips" + TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueSingleCardFixture.*" + ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueMultiDeviceFixture.*" + ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="DPrintFixture.*:WatcherFixture.*" + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_ttmetal_tests $duration seconds to complete" +} + +run_t3000_ttnn_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_ttnn_tests" + + pytest tests/ttnn/unit_tests/test_multi_device.py + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_ttnn_tests $duration seconds to complete" +} + +run_t3000_falcon7b_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_falcon7b_tests" + + pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_mlp.py + pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_attention.py + pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_decoder.py + #pytest 
models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_causallm.py + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_falcon7b_tests $duration seconds to complete" +} + +run_t3000_falcon40b_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_falcon40b_tests" + + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_mlp.py + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_attention.py + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_decoder.py + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_falcon40b_tests $duration seconds to complete" +} + +run_t3000_mixtral_tests() { + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_mixtral_tests" + + pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_attention.py + pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_mlp.py + pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_rms_norm.py + pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_embedding.py + pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_moe.py + pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_decoder.py + pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_model.py::test_mixtral_model_inference[1-1-pcc] + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_mixtral_tests $duration seconds to complete" +} + +run_t3000_tests() { + # Run ttmetal tests + run_t3000_ttmetal_tests + + # Run ttnn tests + run_t3000_ttnn_tests + + # Run falcon7b tests + run_t3000_falcon7b_tests + + # Run falcon40b tests + run_t3000_falcon40b_tests + + # Run mixtral tests + run_t3000_mixtral_tests +} + 
+main() { + if [[ -z "$TT_METAL_HOME" ]]; then + echo "Must provide TT_METAL_HOME in environment" 1>&2 + exit 1 + fi + + if [[ -z "$ARCH_NAME" ]]; then + echo "Must provide ARCH_NAME in environment" 1>&2 + exit 1 + fi + + # Run all tests + cd $TT_METAL_HOME + export PYTHONPATH=$TT_METAL_HOME + + run_t3000_tests +} + +main "$@" diff --git a/tests/scripts/tg/run_pre_post_commit_regressions_tg.sh b/tests/scripts/tg/run_pre_post_commit_regressions_tg.sh deleted file mode 100755 index 3d8f32fdf8e..00000000000 --- a/tests/scripts/tg/run_pre_post_commit_regressions_tg.sh +++ /dev/null @@ -1,17 +0,0 @@ - -#/bin/bash - -set -eo pipefail - -if [[ -z "$TT_METAL_HOME" ]]; then - echo "Must provide TT_METAL_HOME in environment" 1>&2 - exit 1 -fi - -if [[ -z "$ARCH_NAME" ]]; then - echo "Must provide ARCH_NAME in environment" 1>&2 - exit 1 -fi - -# Write tests here! -echo "Fill me!" diff --git a/tests/scripts/tg/run_tg_unit_tests.sh b/tests/scripts/tg/run_tg_unit_tests.sh new file mode 100755 index 00000000000..5a5c93de2ae --- /dev/null +++ b/tests/scripts/tg/run_tg_unit_tests.sh @@ -0,0 +1,28 @@ + +#/bin/bash +set -eo pipefail + +run_tg_tests() { + # Write tests here + echo "LOG_METAL: Fill me!" 
+} + +main() { + if [[ -z "$TT_METAL_HOME" ]]; then + echo "Must provide TT_METAL_HOME in environment" 1>&2 + exit 1 + fi + + if [[ -z "$ARCH_NAME" ]]; then + echo "Must provide ARCH_NAME in environment" 1>&2 + exit 1 + fi + + # Run all tests + cd $TT_METAL_HOME + export PYTHONPATH=$TT_METAL_HOME + + run_tg_tests +} + +main "$@" diff --git a/tests/scripts/tgg/run_pre_post_commit_regressions_tgg.sh b/tests/scripts/tgg/run_pre_post_commit_regressions_tgg.sh deleted file mode 100755 index 3d8f32fdf8e..00000000000 --- a/tests/scripts/tgg/run_pre_post_commit_regressions_tgg.sh +++ /dev/null @@ -1,17 +0,0 @@ - -#/bin/bash - -set -eo pipefail - -if [[ -z "$TT_METAL_HOME" ]]; then - echo "Must provide TT_METAL_HOME in environment" 1>&2 - exit 1 -fi - -if [[ -z "$ARCH_NAME" ]]; then - echo "Must provide ARCH_NAME in environment" 1>&2 - exit 1 -fi - -# Write tests here! -echo "Fill me!" diff --git a/tests/scripts/tgg/run_tgg_unit_tests.sh b/tests/scripts/tgg/run_tgg_unit_tests.sh new file mode 100755 index 00000000000..b8c209a22cb --- /dev/null +++ b/tests/scripts/tgg/run_tgg_unit_tests.sh @@ -0,0 +1,28 @@ + +#/bin/bash +set -eo pipefail + +run_tgg_tests() { + # Write tests here + echo "LOG_METAL: Fill me!" +} + +main() { + if [[ -z "$TT_METAL_HOME" ]]; then + echo "Must provide TT_METAL_HOME in environment" 1>&2 + exit 1 + fi + + if [[ -z "$ARCH_NAME" ]]; then + echo "Must provide ARCH_NAME in environment" 1>&2 + exit 1 + fi + + # Run all tests + cd $TT_METAL_HOME + export PYTHONPATH=$TT_METAL_HOME + + run_tgg_tests +} + +main "$@"