Add all gather perf to TG pipeline
Aswinmcw committed Nov 13, 2024
1 parent f38e3ba commit 1519429
Showing 5 changed files with 63 additions and 11 deletions.
47 changes: 41 additions & 6 deletions .github/workflows/tg-model-perf-tests-impl.yaml
@@ -23,6 +23,13 @@ jobs:
runs-on: ["arch-wormhole_b0", "config-tg", "in-service", "bare-metal", "pipeline-perf"],
cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type cnn_model_perf_tg_device --dispatch-mode ""'
},
{ name: "t3k CCL all_gather perf tests",
arch: wormhole_b0,
cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type ccl_all_gather_perf_tg_device --dispatch-mode ""',
timeout: 75,
tracy: true,
runs-on: ["arch-wormhole_b0", "config-tg", "in-service", "bare-metal", "pipeline-perf"],
owner_id: ULMEPM2MA}, # Sean Nijjar
]
name: ${{ matrix.test-group.name }}
env:
@@ -41,13 +48,25 @@ jobs:
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
- name: Download profiler build artifact
id: download-profiler-artifact
if: ${{ matrix.test-group.tracy }}
uses: actions/download-artifact@v4
with:
name: TTMetal_build_${{ matrix.test-group.arch }}_profiler
continue-on-error: true
- name: Download build artifact
id: download-artifact
if: ${{ !matrix.test-group.tracy }}
uses: actions/download-artifact@v4
with:
name: TTMetal_build_${{ matrix.test-group.arch }}
- name: Extract files
if: ${{ matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy }}
run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
- uses: ./.github/actions/install-python-deps
- name: Run model perf regression tests
if: ${{ matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy }}
timeout-minutes: 60
run: |
source ${{ github.workspace }}/python_env/bin/activate
@@ -56,12 +75,28 @@ jobs:
${{ matrix.test-group.cmd }}
- name: Check perf report exists
id: check-perf-report
if: ${{ !cancelled() }}
if: ${{ !cancelled() && (matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy) }}
run: |
ls -hal
export PERF_REPORT_FILENAME=Models_Perf_$(date +%Y_%m_%d).csv
ls -hal $PERF_REPORT_FILENAME
echo "perf_report_filename=$PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT"
TODAY=$(date +%Y_%m_%d)
PERF_REPORT_FILENAME_MODELS="Models_Perf_${TODAY}.csv"
PERF_REPORT_FILENAME_CCL="CCL_Perf_${TODAY}.csv"
if [ "${{ matrix.test-info.tracy }}" == "true" ]; then
if [ -f "$PERF_REPORT_FILENAME_CCL" ]; then
echo "Found CCL Perf report: $PERF_REPORT_FILENAME_CCL"
echo "perf_report_filename=$PERF_REPORT_FILENAME_CCL" >> "$GITHUB_OUTPUT"
else
echo "No CCL perf report found for today."
exit 1
fi
else
if [ -f "$PERF_REPORT_FILENAME_MODELS" ]; then
echo "Found Models Perf report: $PERF_REPORT_FILENAME_MODELS"
echo "perf_report_filename=$PERF_REPORT_FILENAME_MODELS" >> "$GITHUB_OUTPUT"
else
echo "No Models perf report found for today."
exit 1
fi
fi
- name: Upload perf report
if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }}
uses: actions/upload-artifact@v4
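The reworked "Check perf report exists" step picks the expected CSV based on whether the job ran under the tracy profiler: tracy runs look for CCL_Perf_<date>.csv, non-tracy runs for Models_Perf_<date>.csv, and the step fails if the file is missing. A minimal local sketch of that selection logic, assuming the tests have already dropped their CSV into the working directory (the filenames mirror the workflow step; nothing below is part of the commit itself):

#!/usr/bin/env bash
# Sketch only: mimic the workflow's report-selection step outside CI.
set -euo pipefail

TRACY="${1:-true}"            # pass "true" to mimic a tracy/CCL run
TODAY="$(date +%Y_%m_%d)"

if [ "$TRACY" = "true" ]; then
    report="CCL_Perf_${TODAY}.csv"
else
    report="Models_Perf_${TODAY}.csv"
fi

if [ -f "$report" ]; then
    echo "Found perf report: $report"
else
    echo "No perf report found for today ($report)." >&2
    exit 1
fi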
8 changes: 7 additions & 1 deletion .github/workflows/tg-model-perf-tests.yaml
@@ -11,7 +11,13 @@ jobs:
with:
arch: '["wormhole_b0"]'
secrets: inherit
build-artifact-profiler:
uses: ./.github/workflows/build-artifact.yaml
with:
arch: '["wormhole_b0"]'
tracy: true
secrets: inherit
tg-model-perf-tests:
needs: build-artifact
needs: [build-artifact, build-artifact-profiler]
secrets: inherit
uses: ./.github/workflows/tg-model-perf-tests-impl.yaml
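The caller workflow now builds two artifacts, a plain build and a tracy-instrumented profiler build, and the perf job depends on both. To pull the same artifacts from a finished run locally, something like the following should work with the GitHub CLI, assuming build-artifact.yaml publishes them under the TTMetal_build_<arch> and TTMetal_build_<arch>_profiler names that the impl workflow downloads (the run id is a placeholder):

# Sketch: download both build artifacts from a completed workflow run.
RUN_ID="<run-id>"   # placeholder for a real workflow run id
gh run download "$RUN_ID" --name TTMetal_build_wormhole_b0
gh run download "$RUN_ID" --name TTMetal_build_wormhole_b0_profiler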
8 changes: 8 additions & 0 deletions tests/scripts/run_tests.sh
@@ -241,6 +241,12 @@ model_perf_tg_device() {

./tests/scripts/tg/run_tg_model_perf_tests.sh --pipeline-type "$pipeline_type"
}

# Run CCL perf tests
ccl_perf_tg_device() {

./tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh -t tg
}
##########################TG##########################

##########################TGG##########################
@@ -321,6 +327,8 @@ run_pipeline_tests() {
demos_tg_device "$tt_arch" "$pipeline_type" "$dispatch_mode"
elif [[ $pipeline_type == *"model_perf_tg_device" ]]; then
model_perf_tg_device "$tt_arch" "$pipeline_type" "$dispatch_mode"
elif [[ $pipeline_type == "ccl_all_gather_perf_tg_device" ]]; then
ccl_perf_tg_device
# TGG pipelines
elif [[ $pipeline_type == "unit_tgg_device" ]]; then
unit_tgg_device "$tt_arch" "$pipeline_type" "$dispatch_mode"
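With the ccl_all_gather_perf_tg_device pipeline type wired into run_tests.sh, the job can also be reproduced by hand. A rough local invocation, assuming a TG machine with the repo built and the Python env activated, mirroring the cmd entry in the workflow matrix:

# Sketch: run the new CCL all_gather perf pipeline locally on a TG system.
export TT_METAL_HOME="$(pwd)"
export PYTHONPATH="$(pwd)"
./tests/scripts/run_tests.sh --tt-arch wormhole_b0 \
    --pipeline-type ccl_all_gather_perf_tg_device \
    --dispatch-mode ""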
6 changes: 4 additions & 2 deletions tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py
@@ -177,10 +177,12 @@ def calculate_bandwidth(row):
group_df.rename(columns={"INPUT_0_LAYOUT": "Layout", "INPUT_0_DATATYPE": "Data Type"}, inplace=True)

group_df["Input Shape"] = group_df.apply(
lambda row: f"[{row['INPUT_0_W']}, {row['INPUT_0_Z']}, {row['INPUT_0_Y']}, {row['INPUT_0_X']}]", axis=1
lambda row: f"[{int(row['INPUT_0_W'])}, {int(row['INPUT_0_Z'])}, {int(row['INPUT_0_Y'])}, {int(row['INPUT_0_X'])}]",
axis=1,
)
group_df["Output Shape"] = group_df.apply(
lambda row: f"[{row['OUTPUT_0_W']}, {row['OUTPUT_0_Z']}, {row['OUTPUT_0_Y']}, {row['OUTPUT_0_X']}]", axis=1
lambda row: f"[{int(row['OUTPUT_0_W'])}, {int(row['OUTPUT_0_Z'])}, {int(row['OUTPUT_0_Y'])}, {int(row['OUTPUT_0_X'])}]",
axis=1,
)
group_df["Cycles Count"] = group_df["DEVICE FW END CYCLE"] - group_df["DEVICE FW START CYCLE"]
group_df[["Op BW [GB/s]", "Link BW [GB/s]"]] = group_df.apply(calculate_bandwidth, axis=1, result_type="expand")
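The only change in perf_csv.py is wrapping the shape dimensions in int(), presumably because pandas parses those profiler CSV columns as floats, so the formatted shapes would otherwise read as [32.0, 1.0, ...]. A quick shell check illustrating the difference (the column names match the script; the values are made up):

# Sketch: why the int() cast matters when formatting pandas float columns.
python3 -c "
import pandas as pd
row = pd.DataFrame({'INPUT_0_W': [32.0], 'INPUT_0_Z': [1.0]}).iloc[0]
print(f\"[{row['INPUT_0_W']}, {row['INPUT_0_Z']}]\")            # [32.0, 1.0]
print(f\"[{int(row['INPUT_0_W'])}, {int(row['INPUT_0_Z'])}]\")  # [32, 1]
"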
5 changes: 3 additions & 2 deletions tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py
@@ -295,9 +295,10 @@ def test_reduce_scatter_on_n300(
],
)
@pytest.mark.parametrize("replication_factor", [8])
@pytest.mark.parametrize("num_iters", [20])
@pytest.mark.parametrize("enable_async", [True])
@pytest.mark.parametrize("mesh_device", [pytest.param((8, 4), id="8x4_grid")], indirect=True)
@pytest.mark.parametrize("device_params", [{"trace_region_size": 266240}], indirect=True)
@pytest.mark.parametrize("device_params", [{"trace_region_size": 532480}], indirect=True)
def test_all_gather_on_tg(
mesh_device,
num_devices,
@@ -311,7 +312,7 @@ def test_all_gather_on_tg(
function_level_defaults,
enable_async,
replication_factor,
num_iters=1,
num_iters,
):
run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
mesh_device,
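In the test itself, num_iters moves from a hard-coded default of 1 to a pytest parameter of 20, and trace_region_size is doubled to 532480, presumably to make room for the longer traced run. To exercise just this test outside the profiling wrapper, something like the following should work (the node id comes from the diff; any extra environment that run_all_gather_profile.sh sets up is not shown in this commit):

# Sketch: run the updated TG all_gather perf test directly with pytest.
pytest tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py::test_all_gather_on_tg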
