Add N300 perf to pipeline

tenstorrent · Nov 11, 2024 · a47700c · a47700c
1 parent fbc8a9d
commit a47700c
Show file tree

Hide file tree

Showing 3 changed files with 65 additions and 7 deletions.
diff --git a/.github/workflows/perf-models-impl.yaml b/.github/workflows/perf-models-impl.yaml
@@ -15,6 +15,9 @@ jobs:
           {name: "N300 WH B0", arch: wormhole_b0, runs-on: ["N300", "pipeline-perf", "bare-metal", "in-service"], machine-type: "bare_metal"},
         ]
         model-type: [llm_javelin, cnn_javelin, other]
+        include:
+          - test-info: {name: "N300 Perf tests", arch: wormhole_b0, runs-on: ["N300", "pipeline-perf", "bare-metal", "in-service"], machine-type: "bare_metal", tracy: true, owner_id: ULMEPM2MA} # Sean Nijjar
+            model-type: CCL
     name: "${{ matrix.model-type }} ${{ matrix.test-info.name }}"
     env:
       TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
@@ -32,14 +35,26 @@ jobs:
         run: |
           echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
           echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV
-      - uses: actions/download-artifact@v4
+      - name: Download profiler build artifact
+        id: download-profiler-artifact
+        if: ${{ matrix.test-info.tracy }}
+        uses: actions/download-artifact@v4
+        with:
+          name: TTMetal_build_${{ matrix.test-info.arch }}_profiler
+        continue-on-error: true
+      - name: Download build artifact
+        id: download-artifact
+        if: ${{ !matrix.test-info.tracy }}
+        uses: actions/download-artifact@v4
         with:
           name: TTMetal_build_${{ matrix.test-info.arch }}
       - name: Extract files
+        if: ${{ matrix.test-info.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-info.tracy }}
         run: tar -xvf ttm_${{ matrix.test-info.arch }}.tar
       - uses: ./.github/actions/install-python-deps
       - name: Run performance regressions
         id: performance_tests
+        if: ${{ matrix.test-info.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-info.tracy }}
         timeout-minutes: 70
         run: |
           source ${{ github.workspace }}/python_env/bin/activate
@@ -51,12 +66,28 @@ jobs:
       #    slack_webhook_url: ${{ secrets.SLACK_WEBHOOK_URL }}
       - name: Check perf report exists
         id: check-perf-report
-        if: ${{ !cancelled() }}
+        if: ${{ !cancelled() && (matrix.test-info.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-info.tracy) }}
         run: |
-          ls -hal
-          export PERF_REPORT_FILENAME=Models_Perf_$(date +%Y_%m_%d).csv
-          ls -hal $PERF_REPORT_FILENAME
-          echo "perf_report_filename=$PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT"
+          TODAY=$(date +%Y_%m_%d)
+          PERF_REPORT_FILENAME_MODELS="Models_Perf_${TODAY}.csv"
+          PERF_REPORT_FILENAME_CCL="CCL_Perf_${TODAY}.csv"
+          if [ "${{ matrix.test-info.tracy }}" == "true" ]; then
+            if [ -f "$PERF_REPORT_FILENAME_CCL" ]; then
+              echo "Found CCL Perf report: $PERF_REPORT_FILENAME_CCL"
+              echo "perf_report_filename=$PERF_REPORT_FILENAME_CCL" >> "$GITHUB_OUTPUT"
+            else
+              echo "No CCL perf report found for today."
+              exit 1
+            fi
+          else
+            if [ -f "$PERF_REPORT_FILENAME_MODELS" ]; then
+              echo "Found Models Perf report: $PERF_REPORT_FILENAME_MODELS"
+              echo "perf_report_filename=$PERF_REPORT_FILENAME_MODELS" >> "$GITHUB_OUTPUT"
+            else
+              echo "No Models perf report found for today."
+              exit 1
+            fi
+          fi
       - name: Upload perf report
         if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }}
         uses: actions/upload-artifact@v4

diff --git a/.github/workflows/perf-models.yaml b/.github/workflows/perf-models.yaml
@@ -10,7 +10,13 @@ jobs:
   build-artifact:
     uses: ./.github/workflows/build-artifact.yaml
     secrets: inherit
+  build-artifact-profiler:
+    uses: ./.github/workflows/build-artifact.yaml
+    with:
+      arch: '["wormhole_b0"]'
+      tracy: true
+    secrets: inherit
   models-perf:
-    needs: build-artifact
+    needs: [build-artifact, build-artifact-profiler]
     uses: ./.github/workflows/perf-models-impl.yaml
     secrets: inherit
diff --git a/tests/scripts/run_performance.sh b/tests/scripts/run_performance.sh
@@ -37,6 +37,25 @@ run_perf_models_other() {
     env python models/perf/merge_perf_results.py
 }
 
+run_n300_ccl_all_gather_perf_tests() {
+  # Runs the N300 CCL all-gather profile script, logs the elapsed seconds, and exits 1 if it failed.
+  fail=0
+  start_time=$(date +%s)
+
+  echo "LOG_METAL: Running run_n300_ccl_all_gather_perf_tests"
+
+  tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh -t n300
+  fail=$((fail + $?))  # numeric add: `fail+=$?` appends digits, and "08"/"0139" break the -ne test below
+
+  # Log the elapsed time first so it is reported even when the run failed
+  end_time=$(date +%s)
+  duration=$((end_time - start_time))
+  echo "LOG_METAL: run_n300_ccl_all_gather_perf_tests $duration seconds to complete"
+  if [[ $fail -ne 0 ]]; then
+    exit 1
+  fi
+}
+
 run_perf_models_llm_javelin() {
     local tt_arch=$1
     local test_marker=$2
@@ -182,6 +201,8 @@ main() {
         run_perf_models_cnn_javelin "$tt_arch" "$test_marker"
     elif [[ "$pipeline_type" == *"other_models_performance"* ]]; then
         run_perf_models_other "$tt_arch" "$test_marker"
+    elif [[ "$pipeline_type" == "CCL_models_performance"* ]]; then
+        run_n300_ccl_all_gather_perf_tests
     else
         echo "$pipeline_type is not recoognized performance pipeline" 2>&1
         exit 1