-
Notifications
You must be signed in to change notification settings - Fork 91
86 lines (84 loc) · 3.47 KB
/
perf-models.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# Workflow header: display name and the events that start a run.
name: 'Model perf regressions and output report'

on:
  # Manual runs (Actions tab or API).
  workflow_dispatch:
  # Lets other workflows invoke this one as a reusable workflow.
  workflow_call:
  # Daily runs at minute 0 of hours 2, 7, 10, 14, 17, 20 and 23
  # (GitHub evaluates schedule cron expressions in UTC).
  schedule:
    - cron: '0 2,7,10,14,17,20,23 * * *'
jobs:
  # Calls the build-docker-artifact reusable workflow, passing this
  # workflow's secrets through to it.
  build-docker-artifact:
    uses: ./.github/workflows/build-docker-artifact.yaml
    secrets: inherit

  # Calls the build-artifact reusable workflow once build-docker-artifact
  # has finished.
  build-artifact:
    needs: build-docker-artifact
    uses: ./.github/workflows/build-artifact.yaml
    secrets: inherit

  # Runs the model performance pipelines: one job per combination of
  # test-info entry (target machine) and model-type.
  models-perf:
    needs: build-artifact
    strategy:
      # Do not fail-fast because we need to ensure all tests go to completion
      # so we try not to get hanging machines
      fail-fast: false
      matrix:
        test-info:
          - name: "GS"
            arch: grayskull
            runs-on: ["perf-grayskull", "self-reset"]
            machine-type: "bare_metal"
          - name: "N300 WH B0"
            arch: wormhole_b0
            runs-on: ["perf-wormhole_b0", "self-reset"]
            machine-type: "bare_metal"
        model-type: [llm_javelin, cnn_javelin, other]
    name: "${{ matrix.model-type }} ${{ matrix.test-info.name }}"
    env:
      TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
      ARCH_NAME: ${{ matrix.test-info.arch }}
      LOGURU_LEVEL: INFO
      # JSON string; it contains a space, so it must stay quoted wherever it
      # is expanded in a shell command.
      TTNN_CONFIG_OVERRIDES: '{"enable_fast_runtime_mode": true}'
    environment: dev
    runs-on: ${{ matrix.test-info.runs-on }}
    steps:
      # NOTE(review): in the copy this was edited from, the action reference
      # was garbled to "[email protected]". Restored here as
      # checkout-with-submodule-lfs@v2.0.0 -- confirm the action name and tag
      # against the repository before merging.
      - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0
      - name: Enable Performance mode
        run: |
          sudo cpupower frequency-set -g performance
      - name: Ensure weka mount is active
        run: |
          sudo systemctl restart mnt-MLPerf.mount
          sudo /etc/rc.local
          # Listing the directory makes the step fail if the path is missing.
          ls -al /mnt/MLPerf/bit_error_tests
      - name: Set up dynamic env vars for build
        run: |
          echo "TT_METAL_HOME=$(pwd)" >> "$GITHUB_ENV"
          echo "PYTHONPATH=$(pwd)" >> "$GITHUB_ENV"
      - uses: actions/download-artifact@v4
        with:
          name: TTMetal_build_${{ matrix.test-info.arch }}
      - name: Extract files
        run: tar -xvf ttm_${{ matrix.test-info.arch }}.tar
      - uses: ./.github/actions/install-python-deps
      - uses: ./.github/actions/load-docker-image
      - name: Run performance regressions
        id: performance_tests
        timeout-minutes: 30
        # The -e argument is quoted so the JSON value (which contains a space)
        # reaches the script as a single argument instead of being word-split.
        run: |
          ./scripts/docker/run_docker_cmd.sh -e "TTNN_CONFIG_OVERRIDES=${TTNN_CONFIG_OVERRIDES}" -- \
          /bin/bash -c "source python_env/bin/activate \
          && ./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type ${{ matrix.model-type }}_models_performance_${{ matrix.test-info.machine-type }}"
      # An `if:` with no status-check function is implicitly combined with
      # success(), which would skip this step whenever an earlier step failed.
      # !cancelled() lets it run when the perf step did not succeed.
      - uses: ./.github/actions/slack-report
        if: ${{ !cancelled() && steps.performance_tests.outcome != 'success' }}
        with:
          slack_webhook_url: ${{ secrets.SLACK_WEBHOOK_URL }}
      - name: Check perf report exists
        id: check-perf-report
        if: ${{ !cancelled() }}
        # The expected file name is built from the runner's current date.
        # NOTE(review): presumably this matches the name the test scripts
        # write; a run that crosses midnight would look for the wrong file --
        # confirm.
        run: |
          ls -hal
          export PERF_REPORT_FILENAME="Models_Perf_$(date +%Y_%m_%d).csv"
          ls -hal "$PERF_REPORT_FILENAME"
          echo "perf_report_filename=$PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT"
      # Upload only when the check step above found the report.
      - name: Upload perf report
        if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }}
        uses: actions/upload-artifact@v4
        with:
          name: perf-report-csv-${{ matrix.model-type }}-${{ matrix.test-info.arch }}-${{ matrix.test-info.machine-type }}
          path: "${{ steps.check-perf-report.outputs.perf_report_filename }}"
      # always(): switch the CPU governor back even if earlier steps failed
      # or the run was cancelled.
      - name: Disable Performance mode
        if: always()
        run: |
          sudo cpupower frequency-set -g ondemand