# Multi-Nebula model perf regressions and output report #70
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow that runs the multi-device ("Multi-Nebula") model performance
# regressions and publishes the resulting perf report as an artifact.
name: "Multi-Nebula model perf regressions and output report"

# Triggers:
#   - workflow_dispatch: manual runs from the Actions UI / API.
#   - schedule: unattended periodic runs (GitHub evaluates cron in UTC).
on:
  workflow_dispatch:
  schedule:
    - cron: "0 */12 * * *"  # This cron schedule runs the workflow every 12 hours
jobs:
  # Runs the model performance regression pipeline once per
  # (runner-info, model-type) matrix combination, then checks for and uploads
  # the dated perf report CSV.
  multi-device-models-perf:
    strategy:
      # Do not fail-fast because we need to ensure all tests go to completion
      # so we try not to get hanging machines
      fail-fast: false
      matrix:
        runner-info:
          # N300 2x4
          # NOTE: Never use arch-wormhole_b0 tags, however we're using it here
          # because this machine is used by devs during the day.
          # We don't want other CI runs to interrupt dev flows. However, we
          # need to fix this once we have more 2x4 machines dedicated to CI.
          - name: "n300-2x4"
            arch: wormhole_b0
            runs-on:
              - "perf-t3000"
              - "arch-wormhole_b0"
              - "multi-chip-num-pcie-4"
              - "multi-chip-num-chips-8"
            machine-type: "bare_metal"
        # Each model type selects a different pipeline in the
        # "Run performance regressions" step below.
        model-type: [llm_javelin, cnn_javelin, other]
    name: "${{ matrix.model-type }} ${{ matrix.runner-info.arch }}"
    env:
      TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
      ARCH_NAME: ${{ matrix.runner-info.arch }}
      CONFIG: ci
      LOGURU_LEVEL: INFO
      # JSON payload; single-quoted so YAML keeps it as one literal string.
      TTNN_CONFIG_OVERRIDES: '{"enable_fast_runtime_mode": true}'
    environment: dev
    runs-on: ${{ matrix.runner-info.runs-on }}
    steps:
      # NOTE(review): "[email protected]" is an e-mail-obfuscation artifact from
      # a web copy of this file, not a valid action reference. It presumably
      # replaced "<action-name>@<version>" (likely the repository checkout
      # action) - restore the real ref from version control before use.
      - uses: tenstorrent-metal/metal-workflows/.github/actions/[email protected]
      # Switch the CPU frequency governor to "performance" for the run;
      # reverted by the final "Disable Performance mode" step.
      - name: Enable Performance mode
        run: |
          sudo cpupower frequency-set -g performance
      # Restart the MLPerf mount unit and re-run rc.local, then list a path on
      # the mount; a missing path makes `ls` exit non-zero.
      - name: Ensure weka mount is active
        run: |
          sudo systemctl restart mnt-MLPerf.mount
          sudo /etc/rc.local
          ls -al /mnt/MLPerf/bit_error_tests
      # Export the workspace directory to all later steps via GITHUB_ENV.
      - name: Set up dynamic env vars for build
        run: |
          echo "TT_METAL_HOME=$(pwd)" >> "$GITHUB_ENV"
          echo "PYTHONPATH=$(pwd)" >> "$GITHUB_ENV"
      - name: Build tt-metal and libs
        run: PYTHON_ENV_DIR=$(pwd)/build/python_env ./build_metal.sh
      # Pipeline name is assembled from the matrix values, e.g.
      # "<model-type>_models_performance_<machine-type>_multi_device".
      - name: Run performance regressions
        timeout-minutes: 60
        run: |
          source build/python_env/bin/activate
          ./tests/scripts/run_tests.sh --tt-arch "$ARCH_NAME" --pipeline-type ${{ matrix.model-type }}_models_performance_${{ matrix.runner-info.machine-type }}_multi_device
      # Runs even when an earlier step failed (but not when cancelled).
      # `ls` on the expected file fails this step if the report is missing,
      # which in turn skips the upload step below.
      # NOTE(review): the filename uses the date at the time this step runs;
      # presumably the report is written by the test run with the same date
      # pattern - confirm, since a run crossing midnight would not match.
      - name: Check perf report exists
        id: check-perf-report
        if: ${{ !cancelled() }}
        run: |
          ls -hal
          export PERF_REPORT_FILENAME="Models_Perf_$(date +%Y_%m_%d).csv"
          ls -hal "$PERF_REPORT_FILENAME"
          echo "perf_report_filename=$PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT"
      # Artifact name includes every matrix dimension so that parallel matrix
      # jobs do not collide on the same artifact name.
      - name: Upload perf report
        if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }}
        uses: actions/upload-artifact@v4
        with:
          name: perf-report-csv-${{ matrix.model-type }}-${{ matrix.runner-info.arch }}-${{ matrix.runner-info.machine-type }}
          path: "${{ steps.check-perf-report.outputs.perf_report_filename }}"
      # always(): restore the "ondemand" governor even on failure/cancel.
      - name: Disable Performance mode
        if: always()
        run: |
          sudo cpupower frequency-set -g ondemand