# GitHub Actions workflow: Multi-Nebula model perf regressions and output report (#69)
---
name: "Multi-Nebula model perf regressions and output report"

# Runs on manual trigger or on a fixed schedule.
on:
  workflow_dispatch:
  schedule:
    - cron: "0 */12 * * *"  # This cron schedule runs the workflow every 12 hours

jobs:
  multi-device-models-perf:
    strategy:
      # Do not fail-fast because we need to ensure all tests go to completion
      # so we try not to get hanging machines
      fail-fast: false
      matrix:
        runner-info:
          # N300 2x4
          # NOTE: Never use arch-wormhole_b0 tags, however we're using it here because this machine is used by devs during the day
          # We don't want other CI runs to interrupt dev flows. However, we need to fix this once we have more 2x4 machines dedicated to CI
          - name: "n300-2x4"
            arch: wormhole_b0
            runs-on: ["perf-t3000", "arch-wormhole_b0", "multi-chip-num-pcie-4", "multi-chip-num-chips-8"]
            machine-type: "bare_metal"
        model-type: [llm_javelin, cnn_javelin, other]
    name: "${{ matrix.model-type }} ${{ matrix.runner-info.arch }}"
    env:
      TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
      ARCH_NAME: ${{ matrix.runner-info.arch }}
      CONFIG: ci
      LOGURU_LEVEL: INFO
      TTNN_CONFIG_OVERRIDES: '{"enable_fast_runtime_mode": true}'
    environment: dev
    runs-on: ${{ matrix.runner-info.runs-on }}
    steps:
      # Checkout + submodule/LFS setup via the shared metal-workflows action.
      - uses: tenstorrent-metal/metal-workflows/.github/actions/[email protected]
      - name: Enable Performance mode
        run: |
          sudo cpupower frequency-set -g performance
      - name: Ensure weka mount is active
        run: |
          sudo systemctl restart mnt-MLPerf.mount
          sudo /etc/rc.local
          ls -al /mnt/MLPerf/bit_error_tests
      - name: Set up dynamic env vars for build
        run: |
          echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
          echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV
      - name: Build tt-metal and libs
        run: PYTHON_ENV_DIR=$(pwd)/build/python_env ./build_metal.sh
      - name: Run performance regressions
        timeout-minutes: 60
        run: |
          source build/python_env/bin/activate
          ./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type ${{ matrix.model-type }}_models_performance_${{ matrix.runner-info.machine-type }}_multi_device
      # Verify the CSV report was produced (runs even if the test step failed,
      # as long as the workflow was not cancelled) and export its filename.
      - name: Check perf report exists
        id: check-perf-report
        if: ${{ !cancelled() }}
        run: |
          ls -hal
          export PERF_REPORT_FILENAME=Models_Perf_$(date +%Y_%m_%d).csv
          ls -hal $PERF_REPORT_FILENAME
          echo "perf_report_filename=$PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT"
      # Upload only when the check step actually found the report.
      - name: Upload perf report
        if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }}
        uses: actions/upload-artifact@v4
        with:
          name: perf-report-csv-${{ matrix.model-type }}-${{ matrix.runner-info.arch }}-${{ matrix.runner-info.machine-type }}
          path: "${{ steps.check-perf-report.outputs.perf_report_filename }}"
      # Restore the CPU governor even if earlier steps failed.
      - name: Disable Performance mode
        if: always()
        run: |
          sudo cpupower frequency-set -g ondemand