From f190547583e0246077fd0d19c9c1f0251662b39a Mon Sep 17 00:00:00 2001
From: Aswinmcw
Date: Mon, 11 Nov 2024 06:55:34 +0000
Subject: [PATCH] Add N300 perf to pipeline

Add a CCL perf job on N300 to the perf-models pipeline.

perf-models.yaml:
  Build a second wormhole_b0 artifact with tracy enabled
  (build-artifact-profiler) and make models-perf wait for both builds.

perf-models-impl.yaml:
  Add a "CCL" matrix entry whose test-info sets tracy: true. Every
  per-entry switch reads matrix.test-info (the only test descriptor key
  the matrix defines): entries with test-info.tracy download the
  *_profiler artifact, all other entries download the regular one.
  The profiler download is continue-on-error, so the extract, run and
  report-check steps of a tracy entry only run when that download
  succeeded. The report check looks for CCL_Perf_<date>.csv on tracy
  entries and Models_Perf_<date>.csv otherwise.

run_performance.sh:
  Add run_n300_ccl_all_gather_perf_tests and dispatch to it for
  pipeline types starting with "CCL_models_performance". The exit
  status of the profile script is accumulated arithmetically so any
  non-zero status fails the run.
---
 .github/workflows/perf-models-impl.yaml | 43 +++++++++++++++++++++----
 .github/workflows/perf-models.yaml      |  8 ++++-
 tests/scripts/run_performance.sh        | 21 ++++++++++++
 3 files changed, 65 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/perf-models-impl.yaml b/.github/workflows/perf-models-impl.yaml
index f78bddeae70b..d962795a0ad7 100644
--- a/.github/workflows/perf-models-impl.yaml
+++ b/.github/workflows/perf-models-impl.yaml
@@ -15,6 +15,9 @@ jobs:
           {name: "N300 WH B0", arch: wormhole_b0, runs-on: ["N300", "pipeline-perf", "bare-metal", "in-service"], machine-type: "bare_metal"},
         ]
         model-type: [llm_javelin, cnn_javelin, other]
+        include:
+          - test-info: {name: "N300 Perf tests", arch: wormhole_b0, runs-on: ["N300", "pipeline-perf", "bare-metal", "in-service"], machine-type: "bare_metal", tracy: true, owner_id: ULMEPM2MA} # Sean Nijjar
+            model-type: CCL
     name: "${{ matrix.model-type }} ${{ matrix.test-info.name }}"
     env:
       TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
@@ -32,14 +35,26 @@ jobs:
         run: |
           echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
           echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV
-      - uses: actions/download-artifact@v4
+      - name: Download profiler build artifact
+        id: download-profiler-artifact
+        if: ${{ matrix.test-info.tracy }}
+        uses: actions/download-artifact@v4
+        with:
+          name: TTMetal_build_${{ matrix.test-info.arch }}_profiler
+        continue-on-error: true
+      - name: Download build artifact
+        id: download-artifact
+        if: ${{ !matrix.test-info.tracy }}
+        uses: actions/download-artifact@v4
         with:
           name: TTMetal_build_${{ matrix.test-info.arch }}
       - name: Extract files
+        if: ${{ matrix.test-info.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-info.tracy }}
         run: tar -xvf ttm_${{ matrix.test-info.arch }}.tar
       - uses: ./.github/actions/install-python-deps
       - name: Run performance regressions
         id: performance_tests
+        if: ${{ matrix.test-info.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-info.tracy }}
         timeout-minutes: 70
         run: |
           source ${{ github.workspace }}/python_env/bin/activate
@@ -51,12 +66,28 @@ jobs:
       #    slack_webhook_url: ${{ secrets.SLACK_WEBHOOK_URL }}
       - name: Check perf report exists
         id: check-perf-report
-        if: ${{ !cancelled() }}
+        if: ${{ !cancelled() && (matrix.test-info.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-info.tracy) }}
         run: |
-          ls -hal
-          export PERF_REPORT_FILENAME=Models_Perf_$(date +%Y_%m_%d).csv
-          ls -hal $PERF_REPORT_FILENAME
-          echo "perf_report_filename=$PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT"
+          TODAY=$(date +%Y_%m_%d)
+          PERF_REPORT_FILENAME_MODELS="Models_Perf_${TODAY}.csv"
+          PERF_REPORT_FILENAME_CCL="CCL_Perf_${TODAY}.csv"
+          if [ "${{ matrix.test-info.tracy }}" == "true" ]; then
+            if [ -f "$PERF_REPORT_FILENAME_CCL" ]; then
+              echo "Found CCL Perf report: $PERF_REPORT_FILENAME_CCL"
+              echo "perf_report_filename=$PERF_REPORT_FILENAME_CCL" >> "$GITHUB_OUTPUT"
+            else
+              echo "No CCL perf report found for today."
+              exit 1
+            fi
+          else
+            if [ -f "$PERF_REPORT_FILENAME_MODELS" ]; then
+              echo "Found Models Perf report: $PERF_REPORT_FILENAME_MODELS"
+              echo "perf_report_filename=$PERF_REPORT_FILENAME_MODELS" >> "$GITHUB_OUTPUT"
+            else
+              echo "No Models perf report found for today."
+              exit 1
+            fi
+          fi
       - name: Upload perf report
         if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }}
         uses: actions/upload-artifact@v4
diff --git a/.github/workflows/perf-models.yaml b/.github/workflows/perf-models.yaml
index c7f0d9313a59..46e8ec5769d5 100644
--- a/.github/workflows/perf-models.yaml
+++ b/.github/workflows/perf-models.yaml
@@ -10,7 +10,13 @@ jobs:
   build-artifact:
     uses: ./.github/workflows/build-artifact.yaml
     secrets: inherit
+  build-artifact-profiler:
+    uses: ./.github/workflows/build-artifact.yaml
+    with:
+      arch: '["wormhole_b0"]'
+      tracy: true
+    secrets: inherit
   models-perf:
-    needs: build-artifact
+    needs: [build-artifact, build-artifact-profiler]
     uses: ./.github/workflows/perf-models-impl.yaml
     secrets: inherit
diff --git a/tests/scripts/run_performance.sh b/tests/scripts/run_performance.sh
index 80db980c9935..a1a57dc1681e 100755
--- a/tests/scripts/run_performance.sh
+++ b/tests/scripts/run_performance.sh
@@ -37,6 +37,25 @@ run_perf_models_other() {
     env python models/perf/merge_perf_results.py
 }
 
+run_n300_ccl_all_gather_perf_tests() {
+    # Record the start time
+    fail=0
+    start_time=$(date +%s)
+
+    echo "LOG_METAL: Running run_n300_ccl_all_gather_perf_tests"
+
+    tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh -t n300
+    fail=$((fail + $?))
+
+    # Record the end time
+    end_time=$(date +%s)
+    duration=$((end_time - start_time))
+    echo "LOG_METAL: run_n300_ccl_all_gather_perf_tests $duration seconds to complete"
+    if [[ $fail -ne 0 ]]; then
+        exit 1
+    fi
+}
+
 run_perf_models_llm_javelin() {
     local tt_arch=$1
     local test_marker=$2
@@ -182,6 +201,8 @@ main() {
         run_perf_models_cnn_javelin "$tt_arch" "$test_marker"
     elif [[ "$pipeline_type" == *"other_models_performance"* ]]; then
         run_perf_models_other "$tt_arch" "$test_marker"
+    elif [[ "$pipeline_type" == "CCL_models_performance"* ]]; then
+        run_n300_ccl_all_gather_perf_tests
     else
         echo "$pipeline_type is not recoognized performance pipeline" 2>&1
         exit 1