From ec0cc14c288af65a93888da7975edaf3b4c8e7ba Mon Sep 17 00:00:00 2001 From: Austin Ho <109362939+tt-aho@users.noreply.github.com> Date: Mon, 5 Aug 2024 13:29:46 -0400 Subject: [PATCH] Enable T3K Resnet Tests (#11030) * #10244: Fix optional output tensor handling for reshard * #0: Add enable_async_mode device fixture and refactor use_program_cache * #0: Cleanup ttnn_resnet single device test files * #10244: Fix multi-device api issues for ttnn resnet tests, and add them to ci * #10244: Add E2E performance tests for ttnn_resnet on t3000 * #10244: Add t3000 perf results for ttnn_resnet to README * #0: Remove initial space when parsing perf csv * #0: Increase timeout for Nightly N300 WH-only models job due to some ci machines being slower than others --- ...-dispatch-full-regressions-and-models.yaml | 2 +- .github/workflows/t3000-frequent-tests.yaml | 16 +- .github/workflows/t3000-model-perf-tests.yaml | 2 + README.md | 16 +- conftest.py | 49 +- models/demos/ttnn_resnet/README.md | 5 + .../multi_device/test_perf_ttnn_resnet.py | 599 ++++++++++++++++++ .../test_ttnn_resnet50_performant.py | 114 ++-- .../tests/test_perf_device_ttnn_resnet.py | 4 +- .../tests/test_perf_ttnn_resnet.py | 45 +- .../tests/test_ttnn_resnet50_performant.py | 7 +- .../tests/ttnn_resnet_test_infra.py | 20 +- models/perf/perf_utils.py | 2 +- .../scripts/t3000/run_t3000_frequent_tests.sh | 21 + .../t3000/run_t3000_model_perf_tests.sh | 21 + .../tt_dnn/op_library/sharded/sharded_op.hpp | 4 +- 16 files changed, 798 insertions(+), 129 deletions(-) create mode 100644 models/demos/ttnn_resnet/tests/multi_device/test_perf_ttnn_resnet.py diff --git a/.github/workflows/fast-dispatch-full-regressions-and-models.yaml b/.github/workflows/fast-dispatch-full-regressions-and-models.yaml index b2b9ebb3cf1..796065f4844 100644 --- a/.github/workflows/fast-dispatch-full-regressions-and-models.yaml +++ b/.github/workflows/fast-dispatch-full-regressions-and-models.yaml @@ -23,7 +23,7 @@ jobs: { name: "Common models N300 WH B0", arch: wormhole_b0, cmd: tests/scripts/single_card/nightly/run_common_models.sh, timeout: 40 }, { name: "GS-only ttnn nightly", arch: grayskull, cmd: tests/scripts/single_card/nightly/run_ttnn.sh, timeout: 40 }, { name: "GS-only models", arch: grayskull, cmd: tests/scripts/single_card/nightly/run_gs_only.sh, timeout: 40 }, - { name: "N300 WH-only models", arch: wormhole_b0, cmd: tests/scripts/single_card/nightly/run_wh_b0_only.sh, timeout: 40 }, + { name: "N300 WH-only models", arch: wormhole_b0, cmd: tests/scripts/single_card/nightly/run_wh_b0_only.sh, timeout: 50 }, { name: "API tests GS", arch: grayskull, cmd: ./tests/scripts/run_tests.sh --tt-arch grayskull --pipeline-type frequent_api --dispatch-mode fast, timeout: 40 }, { name: "API tests N300 WH B0", arch: wormhole_b0, cmd: ./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_api --dispatch-mode fast, timeout: 40 }, # #9945: Skip SD for now diff --git a/.github/workflows/t3000-frequent-tests.yaml b/.github/workflows/t3000-frequent-tests.yaml index b385aa9da51..2e53c1de2ca 100644 --- a/.github/workflows/t3000-frequent-tests.yaml +++ b/.github/workflows/t3000-frequent-tests.yaml @@ -17,18 +17,20 @@ jobs: fail-fast: false matrix: test-group: [ - { name: "t3k tteager tests", arch: wormhole_b0, cmd: run_t3000_tteager_tests, timeout: 60, + { name: "t3k tteager tests", arch: wormhole_b0, cmd: run_t3000_tteager_tests, timeout: 60, runs-on: ["config-t3000", "in-service", "pipeline-functional"], owner_id: ULMEPM2MA}, #Sean Nijjar - { name: "t3k 
ethernet tests", arch: wormhole_b0, cmd: run_t3000_ethernet_tests, timeout: 60, + { name: "t3k ethernet tests", arch: wormhole_b0, cmd: run_t3000_ethernet_tests, timeout: 60, runs-on: ["config-t3000", "in-service", "pipeline-functional"], owner_id: ULMEPM2MA}, #Sean Nijjar - { name: "t3k trace stress tests", arch: wormhole_b0, cmd: run_t3000_trace_stress_tests, timeout: 120, + { name: "t3k trace stress tests", arch: wormhole_b0, cmd: run_t3000_trace_stress_tests, timeout: 120, runs-on: ["config-t3000", "in-service", "pipeline-functional"], owner_id: U03NG0A5ND7}, #Aditya Saigal - { name: "t3k falcon40b tests", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 120, - runs-on: ["config-t3000", "in-service", "pipeline-functional"], owner_id: U04S2UV6L8N}, #Sofija Jovic - { name: "t3k llama2_70b tests", arch: wormhole_b0, cmd: run_t3000_llama2_70b_tests, timeout: 60, + { name: "t3k falcon40b tests", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 120, + runs-on: ["config-t3000", "in-service", "pipeline-functional"], owner_id: U04S2UV6L8N}, #Sofija Jovic + { name: "t3k llama2_70b tests", arch: wormhole_b0, cmd: run_t3000_llama2_70b_tests, timeout: 60, runs-on: ["config-t3000", "in-service", "pipeline-functional"], owner_id: U03FJB5TM5Y}, #Colman Glagovich - { name: "t3k mixtral tests", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 60, + { name: "t3k mixtral tests", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 60, runs-on: ["config-t3000", "in-service", "pipeline-functional"], owner_id: U03PUAKE719}, #Miguel Tairum Cruz + { name: "t3k resnet tests", arch: wormhole_b0, cmd: run_t3000_resnet_tests, timeout: 30, + runs-on: ["config-t3000", "in-service", "pipeline-functional"], owner_id: U013121KDH9}, #Austin Ho ] name: ${{ matrix.test-group.name }} env: diff --git a/.github/workflows/t3000-model-perf-tests.yaml b/.github/workflows/t3000-model-perf-tests.yaml index f6f5cae00a4..17599d68240 100644 --- a/.github/workflows/t3000-model-perf-tests.yaml +++ b/.github/workflows/t3000-model-perf-tests.yaml @@ -25,6 +25,8 @@ jobs: runs-on: ["arch-wormhole_b0", "config-t3000", "in-service", "pipeline-perf"], owner_id: U03FJB5TM5Y}, # Colman Glagovich { name: "t3k LLM falcon40b model perf tests", model: "falcon40b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 75, runs-on: ["arch-wormhole_b0", "config-t3000", "in-service", "pipeline-perf"], owner_id: U053W15B6JF}, # Djordje Ivanovic + { name: "t3k LLM resnet50 model perf tests", model: "resnet50", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_resnet50_tests, timeout: 75, + runs-on: ["arch-wormhole_b0", "config-t3000", "in-service", "pipeline-perf"], owner_id: U013121KDH9}, # Austin Ho #{ name: "t3k CNN model perf tests ", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_cnn_tests, timeout: 120, owner_id: }, #No tests are being run? ] name: ${{ matrix.test-group.name }} diff --git a/README.md b/README.md index e366f098ac0..d9984b58fcc 100644 --- a/README.md +++ b/README.md @@ -44,12 +44,12 @@ > > Furthermore, all performance numbers here are run or based off an N300 Wormhole card. -| Model | Last Verified Release | Gen. Token [3] | Batch | End-to-end throughput [1] | Device throughput [2] | Target | +| Model | Last Verified Release | Gen. 
Token [3] | Batch | End-to-end throughput [1] | Device throughput [2] | Target | |----------------------------------------------------------------------------------------|---------------------------------------------------------------------------|--------------------|----------------------|--------------------------------|------------------------------|----------------| -| [Falcon7B](./models/demos/wormhole/falcon7b) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | 129th | 32 | 13.7 t/s/u - 438 t/s | 19.5 t/s/u - 624 t/s | 26 | -| [Mistral-7B](./models/demos/wormhole/mistral7b) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | 129th | 32 | 9.9 t/s/u - 317 t/s | 11.0 t/s/u - 352 t/s | 25 | +| [Falcon7B](./models/demos/wormhole/falcon7b) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | 129th | 32 | 13.7 t/s/u - 438 t/s | 19.5 t/s/u - 624 t/s | 26 | +| [Mistral-7B](./models/demos/wormhole/mistral7b) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | 129th | 32 | 9.9 t/s/u - 317 t/s | 11.0 t/s/u - 352 t/s | 25 | | [Mamba-2.8B](./models/demos/wormhole/mamba) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | any | 32 | 11.6 t/s/u - 371 t/s | 16.5 t/s/u - 528 t/s | 41 | -| [LLaMA-3.1-8B](./models/demos/wormhole/llama31_8b) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | 129th | 8 | 8.3 t/s/u - 66.0 t/s | 9.7 t/s/u - 77.9 t/s | 23 | +| [LLaMA-3.1-8B](./models/demos/wormhole/llama31_8b) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | 129th | 8 | 8.3 t/s/u - 66.0 t/s | 9.7 t/s/u - 77.9 t/s | 23 | | [BERT-Large](./models/demos/metal_BERT_large_11/) (sen/s) [4] | | | 8 | 270 | 340 | 400 | | [Stable Diffusion 1.4](./models/demos/wormhole/stable_diffusion) 512x512 (sec/img) [5] | | | 1 | 6 | 5 | 3 | | [ResNet-50](./models/demos/ttnn_resnet) (fps) | | | 16 | 4,300 | 5,550 | 7,000 | @@ -66,14 +66,14 @@ ## TT-QuietBox & TT-LoudBox (2x4 mesh of WHs) Models -| Model | Last Verified Release | Technique | Gen. Token [3] | Batch | End-to-end throughput [1] | Device throughput [2] | Target | +| Model | Last Verified Release | Technique | Gen. 
Token [3] | Batch | End-to-end throughput [1] | Device throughput [2] | Target | |----------------------------------------------------|---------------------------------------------------------------------------|--------------------|---------------------|-----------------------|------------------------------|------------------------------|-----------------| -| [Falcon7B](./models/demos/t3000/falcon7b) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | Data Parallel | 129th | 256 | 7.6 t/s/u - 1950 t/s | 19.5 t/s/u - 4990 t/s | 26 t/s/u | +| [Falcon7B](./models/demos/t3000/falcon7b) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | Data Parallel | 129th | 256 | 7.6 t/s/u - 1950 t/s | 19.5 t/s/u - 4990 t/s | 26 t/s/u | | [LLaMA-2-70B](./models/demos/t3000/llama2_70b) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | Tensor Parallel | 129th | 32 | 10.4 t/s/u - 333 t/s | 16.6 t/s/u - 531 t/s | 20 t/s/u | | [LLaMA-3.1-70B](./models/demos/t3000/llama3_70b) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | Tensor Parallel | 129th | 32 | 10.4 t/s/u - 333 t/s | 15.8 t/s/u - 506 t/s | 20 t/s/u | -| [Falcon40B](./models/demos/t3000/falcon40b) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | Tensor Parallel | 129th | 32 | 5.3 t/s/u - 168 t/s | 12.2 t/s/u - 390 t/s | 36 t/s/u | +| [Falcon40B](./models/demos/t3000/falcon40b) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | Tensor Parallel | 129th | 32 | 5.3 t/s/u - 168 t/s | 12.2 t/s/u - 390 t/s | 36 t/s/u | | [Mixtral7Bx8](./models/demos/t3000/mixtral8x7b) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | Tensor Parallel | 129th | 32 | 13.3 t/s/u - 426 t/s | 21.4 t/s/u - 685 t/s | 33 t/s/u | -| ResNet50 | | Data Parallel | coming soon | | | | | +| [ResNet-50](./models/demos/ttnn_resnet) | | Data Parallel | | 128 | 31,700 | 44,400 | 56,000 | ## Model Updates For the latest model updates and features, please see [MODEL_UPDATES.md](models/MODEL_UPDATES.md) diff --git a/conftest.py b/conftest.py index 597ae033b8f..06aa9ccfd4c 100644 --- a/conftest.py +++ b/conftest.py @@ -271,36 +271,47 @@ def reset_default_device(): ttl.device.SetDefaultDevice(device) -@pytest.fixture(scope="function") -def use_program_cache(request): - import tt_lib as ttl - +def get_devices(request): if "device" in request.fixturenames: - dev = request.getfixturevalue("device") - dev.enable_program_cache() + devices = [request.getfixturevalue("device")] elif "all_devices" in request.fixturenames: devices = request.getfixturevalue("all_devices") - for dev in devices: - dev.enable_program_cache() elif "pcie_devices" in request.fixturenames: devices = request.getfixturevalue("pcie_devices") - for dev in devices: - dev.enable_program_cache() elif "device_mesh" in request.fixturenames: - mesh = request.getfixturevalue("device_mesh") - for device_id in mesh.get_device_ids(): - mesh.get_device(device_id).enable_program_cache() + devices = request.getfixturevalue("device_mesh").get_devices() elif "t3k_device_mesh" in request.fixturenames: - mesh = request.getfixturevalue("t3k_device_mesh") - for device_id in mesh.get_device_ids(): - mesh.get_device(device_id).enable_program_cache() + devices = request.getfixturevalue("t3k_device_mesh").get_devices() elif "pcie_device_mesh" in request.fixturenames: - mesh = request.getfixturevalue("pcie_device_mesh") - for device_id in mesh.get_device_ids(): - 
mesh.get_device(device_id).enable_program_cache() + devices = request.getfixturevalue("pcie_device_mesh").get_devices() else: + devices = [] + return devices + + +@pytest.fixture(scope="function") +def use_program_cache(request): + devices = get_devices(request) + if not devices: logger.warning("No device fixture found to apply program cache to: PROGRAM CACHE DISABLED") + for dev in devices: + dev.enable_program_cache() yield + for dev in devices: + dev.disable_and_clear_program_cache() + + +@pytest.fixture(scope="function") +def enable_async_mode(request): + devices = get_devices(request) + if not devices: + logger.warning("No device fixture found to apply async mode to: ASYNC MODE DISABLED") + + for dev in devices: + dev.enable_async(request.param) + yield request.param + for dev in devices: + dev.enable_async(False) @pytest.fixture(scope="function") diff --git a/models/demos/ttnn_resnet/README.md b/models/demos/ttnn_resnet/README.md index 751f9f7d115..d14673ff3e2 100644 --- a/models/demos/ttnn_resnet/README.md +++ b/models/demos/ttnn_resnet/README.md @@ -8,8 +8,13 @@ Our ImageProcessor on the other hand is based on `microsoft/resnet-50` from hugg ## Performance +### Single Device + To obtain device performance, run `WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml ./tt_metal/tools/profiler/profile_this.py -c "pytest models/demos/ttnn_resnet/tests/test_ttnn_resnet50_performant.py::test_run_resnet50_inference[16-act_dtype0-weight_dtype0-math_fidelity0-device_params0]"` This will generate a CSV report under `/generated/profiler/reports/ops/`. The report file name is logged in the run output. + For end-to-end performance, run `WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/ttnn_resnet/tests/test_perf_ttnn_resnet.py::test_perf_trace_2cqs_bare_metal[16-0.004-25-device_params0]`. This will generate a CSV with the timings and throughputs. Expected end-to-end perf: For batch = 16, it is about `4300 fps` currently. This may vary machine to machine. + +### T3000 ++ For end-to-end performance, run `WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest test_perf_trace_2cqs_t3000[wormhole_b0-True-16-True-0.0043-60-device_params0]`. This will generate a CSV with the timings and throughputs. +Expected end-to-end perf: For batch = 16 per device, or batch 128 in total, it is about `31,700 fps` currently. This may vary machine to machine. diff --git a/models/demos/ttnn_resnet/tests/multi_device/test_perf_ttnn_resnet.py b/models/demos/ttnn_resnet/tests/multi_device/test_perf_ttnn_resnet.py new file mode 100644 index 00000000000..72e08d03921 --- /dev/null +++ b/models/demos/ttnn_resnet/tests/multi_device/test_perf_ttnn_resnet.py @@ -0,0 +1,599 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 + +import torch +from loguru import logger +from transformers import AutoImageProcessor +import pytest +import ttnn +import tt_lib +from ttnn.model_preprocessing import ( + preprocess_model_parameters, +) + +from models.utility_functions import ( + profiler, + disable_persistent_kernel_cache, + run_for_wormhole_b0, +) + +from models.perf.perf_utils import prep_perf_report + +from models.demos.ttnn_resnet.tests.multi_device.test_ttnn_resnet50_performant import ( + setup_l1_sharded_input, + setup_dram_sharded_input, +) +from models.demos.ttnn_resnet.tests.ttnn_resnet_test_infra import load_resnet50_model +from models.demos.ttnn_resnet.tt.custom_preprocessing import create_custom_mesh_preprocessor +from models.demos.ttnn_resnet.tt.ttnn_functional_resnet50_new_conv_api import resnet50 + +try: + from tracy import signpost + + use_signpost = True +except ModuleNotFoundError: + use_signpost = False + + +def create_event(device): + event = [] + if isinstance(device, ttnn.Device): + event.append(tt_lib.device.CreateEvent()) + else: + for dev in device.get_device_ids(): + event.append(tt_lib.device.CreateEvent()) + return event + + +def wait_for_event(device, cq_id, event): + if isinstance(device, ttnn.Device): + tt_lib.device.WaitForEvent(device, cq_id, event) + else: + for dev, eve in zip(device.get_device_ids(), event): + tt_lib.device.WaitForEvent(device.get_device(dev), cq_id, eve) + + +def record_event(device, cq_id, event): + if isinstance(device, ttnn.Device): + tt_lib.device.RecordEvent(device, cq_id, event) + else: + for dev, eve in zip(device.get_device_ids(), event): + tt_lib.device.RecordEvent(device.get_device(dev), cq_id, eve) + + +def buffer_address(tensor): + addr = [] + for ten in ttnn.get_device_tensors(tensor): + addr.append(ten.buffer_address()) + return addr + + +def dump_device_profiler(device): + if isinstance(device, ttnn.Device): + tt_lib.device.DumpDeviceProfiler(device) + else: + for dev in device.get_device_ids(): + tt_lib.device.DumpDeviceProfiler(device.get_device(dev)) + + +# TODO: Create ttnn apis for these +ttnn.create_event = create_event +ttnn.wait_for_event = wait_for_event +ttnn.record_event = record_event +ttnn.buffer_address = buffer_address +ttnn.dump_device_profiler = dump_device_profiler + +model_config = { + "MATH_FIDELITY": ttnn.MathFidelity.LoFi, + "WEIGHTS_DTYPE": ttnn.bfloat8_b, + "ACTIVATIONS_DTYPE": ttnn.bfloat8_b, +} + + +def run_model( + device, tt_inputs, tt_resnet50, mesh_mapper, mesh_composer, num_warmup_iterations, num_measurement_iterations +): + ops_parallel_config = {} + tt_inputs_host, input_mem_config = setup_l1_sharded_input( + device, tt_inputs, tt_resnet50, mesh_mapper, mesh_composer + ) + profiler.start("compile") + tt_inputs = tt_inputs_host.to(device, input_mem_config) + _ = ttnn.from_device(tt_resnet50(tt_inputs, device, ops_parallel_config), blocking=True) + profiler.end("compile") + ttnn.dump_device_profiler(device) + + profiler.start("cache") + tt_inputs = tt_inputs_host.to(device, input_mem_config) + _ = ttnn.from_device(tt_resnet50(tt_inputs, device, ops_parallel_config), blocking=True) + profiler.end("cache") + ttnn.dump_device_profiler(device) + + for iter in range(0, num_warmup_iterations): + tt_inputs = tt_inputs_host.to(device, input_mem_config) + _ = ttnn.from_device(tt_resnet50(tt_inputs, device, ops_parallel_config), blocking=True) + ttnn.dump_device_profiler(device) + + ttnn.synchronize_devices(device) + if use_signpost: + signpost(header="start") + outputs = [] + 
profiler.start(f"run") + for iter in range(0, num_measurement_iterations): + tt_inputs = tt_inputs_host.to(device, input_mem_config) + outputs.append(ttnn.from_device(tt_resnet50(tt_inputs, device, ops_parallel_config), blocking=False)) + ttnn.synchronize_devices(device) + profiler.end(f"run") + if use_signpost: + signpost(header="stop") + ttnn.dump_device_profiler(device) + + +def run_2cq_model( + device, tt_inputs, tt_resnet50, mesh_mapper, mesh_composer, num_warmup_iterations, num_measurement_iterations +): + ops_parallel_config = {} + tt_inputs_host, sharded_mem_config_DRAM, input_mem_config = setup_dram_sharded_input( + device, tt_inputs, tt_resnet50, mesh_mapper, mesh_composer + ) + tt_image_res = tt_inputs_host.to(device, sharded_mem_config_DRAM) + op_event = ttnn.create_event(device) + write_event = ttnn.create_event(device) + # Initialize the op event so we can write + ttnn.record_event(device, 0, op_event) + + profiler.start("compile") + ttnn.wait_for_event(device, 1, op_event) + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) + ttnn.record_event(device, 1, write_event) + ttnn.wait_for_event(device, 0, write_event) + reshard_out = ttnn.to_memory_config(tt_image_res, input_mem_config) + ttnn.record_event(device, 0, op_event) + _ = ttnn.from_device(tt_resnet50(reshard_out, device, ops_parallel_config), blocking=True) + profiler.end("compile") + ttnn.dump_device_profiler(device) + + profiler.start("cache") + ttnn.wait_for_event(device, 1, op_event) + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) + ttnn.record_event(device, 1, write_event) + ttnn.wait_for_event(device, 0, write_event) + reshard_out = ttnn.to_memory_config(tt_image_res, input_mem_config) + ttnn.record_event(device, 0, op_event) + _ = ttnn.from_device(tt_resnet50(reshard_out, device, ops_parallel_config), blocking=True) + profiler.end("cache") + ttnn.dump_device_profiler(device) + + for iter in range(0, num_warmup_iterations): + ttnn.wait_for_event(device, 1, op_event) + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) + ttnn.record_event(device, 1, write_event) + ttnn.wait_for_event(device, 0, write_event) + reshard_out = ttnn.to_memory_config(tt_image_res, input_mem_config) + ttnn.record_event(device, 0, op_event) + _ = ttnn.from_device(tt_resnet50(reshard_out, device, ops_parallel_config), blocking=True) + ttnn.dump_device_profiler(device) + + ttnn.synchronize_devices(device) + if use_signpost: + signpost(header="start") + outputs = [] + profiler.start(f"run") + for iter in range(0, num_measurement_iterations): + ttnn.wait_for_event(device, 1, op_event) + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) + ttnn.record_event(device, 1, write_event) + ttnn.wait_for_event(device, 0, write_event) + reshard_out = ttnn.to_memory_config(tt_image_res, input_mem_config) + ttnn.record_event(device, 0, op_event) + outputs.append(ttnn.from_device(tt_resnet50(reshard_out, device, ops_parallel_config), blocking=False)) + ttnn.synchronize_devices(device) + profiler.end(f"run") + if use_signpost: + signpost(header="stop") + ttnn.dump_device_profiler(device) + + +def run_trace_model( + device, tt_inputs, tt_resnet50, mesh_mapper, mesh_composer, num_warmup_iterations, num_measurement_iterations +): + ops_parallel_config = {} + tt_inputs_host, sharded_mem_config_DRAM, input_mem_config = setup_dram_sharded_input( + device, tt_inputs, tt_resnet50, mesh_mapper, mesh_composer + ) + tt_image_res = tt_inputs_host.to(device, sharded_mem_config_DRAM) + # Compile + 
profiler.start("compile") + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res) + reshard_out = ttnn.to_memory_config(tt_image_res, input_mem_config) + _ = ttnn.from_device(tt_resnet50(reshard_out, device, ops_parallel_config), blocking=True) + profiler.end("compile") + ttnn.dump_device_profiler(device) + + profiler.start("cache") + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res) + reshard_out = ttnn.to_memory_config(tt_image_res, input_mem_config) + _ = ttnn.from_device(tt_resnet50(reshard_out, device, ops_parallel_config), blocking=True) + profiler.end("cache") + ttnn.dump_device_profiler(device) + + # Capture + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res) + tid = ttnn.begin_trace_capture(device, cq_id=0) + reshard_out = ttnn.to_memory_config(tt_image_res, input_mem_config) + tt_output_res = tt_resnet50(reshard_out, device, ops_parallel_config) + ttnn.end_trace_capture(device, tid, cq_id=0) + ttnn.dump_device_profiler(device) + + for iter in range(0, num_warmup_iterations): + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res) + ttnn.execute_trace(device, tid, cq_id=0, blocking=False) + _ = ttnn.from_device(tt_output_res, blocking=True) + ttnn.dump_device_profiler(device) + + ttnn.synchronize_devices(device) + if use_signpost: + signpost(header="start") + outputs = [] + profiler.start(f"run") + for iter in range(0, num_measurement_iterations): + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res) + ttnn.execute_trace(device, tid, cq_id=0, blocking=False) + outputs.append(ttnn.from_device(tt_output_res, blocking=False)) + ttnn.synchronize_devices(device) + profiler.end(f"run") + if use_signpost: + signpost(header="stop") + ttnn.dump_device_profiler(device) + + +def run_trace_2cq_model( + device, tt_inputs, tt_resnet50, mesh_mapper, mesh_composer, num_warmup_iterations, num_measurement_iterations +): + ops_parallel_config = {} + tt_inputs_host, sharded_mem_config_DRAM, input_mem_config = setup_dram_sharded_input( + device, tt_inputs, tt_resnet50, mesh_mapper, mesh_composer + ) + tt_image_res = tt_inputs_host.to(device, sharded_mem_config_DRAM) + + op_event = ttnn.create_event(device) + write_event = ttnn.create_event(device) + # Initialize the op event so we can write + ttnn.record_event(device, 0, op_event) + + profiler.start("compile") + ttnn.wait_for_event(device, 1, op_event) + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) + ttnn.record_event(device, 1, write_event) + ttnn.wait_for_event(device, 0, write_event) + reshard_out = ttnn.to_memory_config(tt_image_res, input_mem_config) + ttnn.record_event(device, 0, op_event) + _ = ttnn.from_device(tt_resnet50(reshard_out, device, ops_parallel_config), blocking=True) + profiler.end("compile") + ttnn.dump_device_profiler(device) + + profiler.start("cache") + ttnn.wait_for_event(device, 1, op_event) + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) + ttnn.record_event(device, 1, write_event) + ttnn.wait_for_event(device, 0, write_event) + reshard_out = ttnn.to_memory_config(tt_image_res, input_mem_config) + first_out_addr = ttnn.buffer_address(reshard_out) + ttnn.record_event(device, 0, op_event) + _ = ttnn.from_device(tt_resnet50(reshard_out, device, ops_parallel_config), blocking=True) + profiler.end("cache") + ttnn.dump_device_profiler(device) + + # Capture + ttnn.wait_for_event(device, 1, op_event) + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) + ttnn.record_event(device, 1, write_event) + + ttnn.wait_for_event(device, 0, 
write_event) + reshard_out = ttnn.to_memory_config(tt_image_res, input_mem_config) + ttnn.record_event(device, 0, op_event) + + tid = ttnn.begin_trace_capture(device, cq_id=0) + tt_output_res = tt_resnet50(reshard_out, device, ops_parallel_config) + reshard_out = ttnn.allocate_tensor_on_device( + reshard_out.shape, reshard_out.dtype, reshard_out.layout, device, input_mem_config + ) + ttnn.end_trace_capture(device, tid, cq_id=0) + assert first_out_addr == ttnn.buffer_address(reshard_out) + ttnn.dump_device_profiler(device) + + for iter in range(0, num_warmup_iterations): + ttnn.wait_for_event(device, 1, op_event) + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) + ttnn.record_event(device, 1, write_event) + ttnn.wait_for_event(device, 0, write_event) + reshard_out = ttnn.experimental.tensor.reshard(tt_image_res, input_mem_config, reshard_out) + ttnn.record_event(device, 0, op_event) + ttnn.execute_trace(device, tid, cq_id=0, blocking=True) + ttnn.dump_device_profiler(device) + + ttnn.synchronize_devices(device) + if use_signpost: + signpost(header="start") + outputs = [] + profiler.start(f"run") + for iter in range(0, num_measurement_iterations): + ttnn.wait_for_event(device, 1, op_event) + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) + ttnn.record_event(device, 1, write_event) + ttnn.wait_for_event(device, 0, write_event) + # TODO: Add in place support to ttnn to_memory_config + reshard_out = ttnn.experimental.tensor.reshard(tt_image_res, input_mem_config, reshard_out) + ttnn.record_event(device, 0, op_event) + ttnn.execute_trace(device, tid, cq_id=0, blocking=False) + outputs.append(tt_output_res.cpu(blocking=False)) + ttnn.synchronize_devices(device) + profiler.end(f"run") + if use_signpost: + signpost(header="stop") + ttnn.dump_device_profiler(device) + + +def run_perf_resnet( + device_batch_size, + expected_inference_time, + expected_compile_time, + hf_cat_image_sample_input, + device, + model_version, + model_location_generator, +): + profiler.clear() + disable_persistent_kernel_cache() + if device_batch_size <= 2: + pytest.skip("Batch size 1 and 2 are not supported with sharded data") + num_devices = 1 if isinstance(device, ttnn.Device) else device.get_num_devices() + batch_size = device_batch_size * num_devices + first_key = f"first_iter_batchsize{batch_size}" + second_key = f"second_iter_batchsize{batch_size}" + cpu_key = f"ref_key_batchsize{batch_size}" + model_name = "microsoft/resnet-50" + + image = hf_cat_image_sample_input + image_processor = AutoImageProcessor.from_pretrained(model_name) + inputs = image_processor(image, return_tensors="pt") + + inputs = inputs["pixel_values"].bfloat16() + comments = f"{list(inputs.shape)[-2]}x{list(inputs.shape)[-1]}_batchsize{batch_size}" + + inputs1 = inputs + for i in range(batch_size - 1): + inputs = torch.cat((inputs, inputs1), dim=0) + + inputs_mesh_mapper = ttnn.ShardTensorToMesh(device, dim=0) + weights_mesh_mapper = ttnn.ReplicateTensorToMesh(device) + output_mesh_composer = ttnn.ConcatMeshToTensor(device, dim=0) + + torch_resnet50 = load_resnet50_model(model_location_generator) + torch_resnet50.eval() + + parameters = preprocess_model_parameters( + initialize_model=lambda: torch_resnet50, + custom_preprocessor=create_custom_mesh_preprocessor(weights_mesh_mapper), + device=None, + ) + torch_resnet50.to(torch.bfloat16) + + tt_resnet50 = resnet50( + device=device, + parameters=parameters, + batch_size=device_batch_size, + model_config=model_config, + dealloc_input=True, + 
final_output_mem_config=ttnn.DRAM_MEMORY_CONFIG if "trace" in model_version else ttnn.L1_MEMORY_CONFIG, + mesh_mapper=weights_mesh_mapper, + ) + ttnn.synchronize_devices(device) + + num_warmup_iterations = 5 + num_measurement_iterations = 15 + + with torch.no_grad(): + profiler.start(cpu_key) + logits = torch_resnet50(inputs) + profiler.end(cpu_key) + + tt_inputs = tt_resnet50.preprocessing(inputs, inputs_mesh_mapper) + if "resnet50_trace_2cqs" in model_version: + run_trace_2cq_model( + device, + tt_inputs, + tt_resnet50, + inputs_mesh_mapper, + output_mesh_composer, + num_warmup_iterations, + num_measurement_iterations, + ) + elif "resnet50_2cqs" in model_version: + run_2cq_model( + device, + tt_inputs, + tt_resnet50, + inputs_mesh_mapper, + output_mesh_composer, + num_warmup_iterations, + num_measurement_iterations, + ) + elif "resnet50_trace" in model_version: + run_trace_model( + device, + tt_inputs, + tt_resnet50, + inputs_mesh_mapper, + output_mesh_composer, + num_warmup_iterations, + num_measurement_iterations, + ) + elif "resnet50" in model_version: + run_model( + device, + tt_inputs, + tt_resnet50, + inputs_mesh_mapper, + output_mesh_composer, + num_warmup_iterations, + num_measurement_iterations, + ) + else: + assert False, f"Model version to run {model_version} not found" + + first_iter_time = profiler.get(f"compile") + profiler.get(f"cache") + + # ensuring inference time fluctuations is not noise + inference_time_avg = profiler.get("run") / num_measurement_iterations + + cpu_time = profiler.get(cpu_key) + compile_time = first_iter_time - 2 * inference_time_avg + prep_perf_report( + model_name=f"ttnn_{model_version}_batch_size{batch_size}", + batch_size=batch_size, + inference_and_compile_time=first_iter_time, + inference_time=inference_time_avg, + expected_compile_time=expected_compile_time, + expected_inference_time=expected_inference_time, + comments=comments, + inference_time_cpu=cpu_time, + ) + + logger.info(f"{model_name} {comments} inference time (avg): {inference_time_avg}") + logger.info(f"{model_name} compile time: {compile_time}") + + +@run_for_wormhole_b0() +@pytest.mark.model_perf_t3000 +@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576}], indirect=True) +@pytest.mark.parametrize( + "device_batch_size, enable_async_mode, expected_inference_time, expected_compile_time", + ( + (16, True, 0.0094, 60), + (16, False, 0.0230, 60), + ), + indirect=["enable_async_mode"], +) +def test_perf_t3000( + device_mesh, + use_program_cache, + device_batch_size, + expected_inference_time, + expected_compile_time, + hf_cat_image_sample_input, + enable_async_mode, + model_location_generator, +): + mode = "async" if enable_async_mode else "sync" + run_perf_resnet( + device_batch_size, + expected_inference_time, + expected_compile_time, + hf_cat_image_sample_input, + device_mesh, + f"resnet50_{mode}", + model_location_generator, + ) + + +@run_for_wormhole_b0() +@pytest.mark.model_perf_t3000 +@pytest.mark.parametrize("device_params", [{"l1_small_size": 32768, "trace_region_size": 1500000}], indirect=True) +@pytest.mark.parametrize( + "device_batch_size, enable_async_mode, expected_inference_time, expected_compile_time", + ( + (16, True, 0.0068, 60), + (16, False, 0.0111, 60), + ), + indirect=["enable_async_mode"], +) +def test_perf_trace_t3000( + device_mesh, + use_program_cache, + device_batch_size, + expected_inference_time, + expected_compile_time, + hf_cat_image_sample_input, + enable_async_mode, + model_location_generator, +): + mode = "async" if enable_async_mode 
else "sync" + run_perf_resnet( + device_batch_size, + expected_inference_time, + expected_compile_time, + hf_cat_image_sample_input, + device_mesh, + f"resnet50_trace_{mode}", + model_location_generator, + ) + + +@run_for_wormhole_b0() +@pytest.mark.model_perf_t3000 +@pytest.mark.parametrize("device_params", [{"l1_small_size": 32768, "num_command_queues": 2}], indirect=True) +@pytest.mark.parametrize( + "device_batch_size, enable_async_mode, expected_inference_time, expected_compile_time", + ( + (16, True, 0.0105, 60), + (16, False, 0.0220, 60), + ), + indirect=["enable_async_mode"], +) +def test_perf_2cqs_t3000( + device_mesh, + use_program_cache, + device_batch_size, + expected_inference_time, + expected_compile_time, + hf_cat_image_sample_input, + enable_async_mode, + model_location_generator, +): + mode = "async" if enable_async_mode else "sync" + run_perf_resnet( + device_batch_size, + expected_inference_time, + expected_compile_time, + hf_cat_image_sample_input, + device_mesh, + f"resnet50_2cqs_{mode}", + model_location_generator, + ) + + +@run_for_wormhole_b0() +@pytest.mark.model_perf_t3000 +@pytest.mark.parametrize( + "device_params", [{"l1_small_size": 32768, "num_command_queues": 2, "trace_region_size": 1332224}], indirect=True +) +@pytest.mark.parametrize( + "device_batch_size, enable_async_mode, expected_inference_time, expected_compile_time", + ( + (16, True, 0.0043, 60), + (16, False, 0.009, 60), + ), + indirect=["enable_async_mode"], +) +def test_perf_trace_2cqs_t3000( + device_mesh, + use_program_cache, + device_batch_size, + expected_inference_time, + expected_compile_time, + hf_cat_image_sample_input, + enable_async_mode, + model_location_generator, +): + mode = "async" if enable_async_mode else "sync" + run_perf_resnet( + device_batch_size, + expected_inference_time, + expected_compile_time, + hf_cat_image_sample_input, + device_mesh, + f"resnet50_trace_2cqs_{mode}", + model_location_generator, + ) diff --git a/models/demos/ttnn_resnet/tests/multi_device/test_ttnn_resnet50_performant.py b/models/demos/ttnn_resnet/tests/multi_device/test_ttnn_resnet50_performant.py index c0b836e44d9..8b6e972a86c 100644 --- a/models/demos/ttnn_resnet/tests/multi_device/test_ttnn_resnet50_performant.py +++ b/models/demos/ttnn_resnet/tests/multi_device/test_ttnn_resnet50_performant.py @@ -21,11 +21,44 @@ use_signpost = False +def create_event(device): + event = [] + if isinstance(device, ttnn.Device): + event.append(tt_lib.device.CreateEvent()) + else: + for dev in device.get_device_ids(): + event.append(tt_lib.device.CreateEvent()) + return event + + +def wait_for_event(device, cq_id, event): + if isinstance(device, ttnn.Device): + tt_lib.device.WaitForEvent(device, cq_id, event) + else: + for dev, eve in zip(device.get_device_ids(), event): + tt_lib.device.WaitForEvent(device.get_device(dev), cq_id, eve) + + +def record_event(device, cq_id, event): + if isinstance(device, ttnn.Device): + tt_lib.device.RecordEvent(device, cq_id, event) + else: + for dev, eve in zip(device.get_device_ids(), event): + tt_lib.device.RecordEvent(device.get_device(dev), cq_id, eve) + + +def buffer_address(tensor): + addr = [] + for ten in ttnn.get_device_tensors(tensor): + addr.append(ten.buffer_address()) + return addr + + # TODO: Create ttnn apis for these -ttnn.create_event = tt_lib.device.CreateEvent -ttnn.wait_for_event = tt_lib.device.WaitForEvent -ttnn.record_event = tt_lib.device.RecordEvent -ttnn.dump_device_profiler = tt_lib.device.DumpDeviceProfiler +ttnn.create_event = create_event 
+ttnn.wait_for_event = wait_for_event +ttnn.record_event = record_event +ttnn.buffer_address = buffer_address # TODO: Move these into Resnet model preprocessing/member functions @@ -84,7 +117,7 @@ def setup_dram_sharded_input(device, tt_inputs, tt_resnet50, mesh_mapper, mesh_c "device_batch_size, act_dtype, weight_dtype, math_fidelity", ((16, ttnn.bfloat8_b, ttnn.bfloat8_b, ttnn.MathFidelity.LoFi),), ) -@pytest.mark.parametrize("enable_async", [True, False]) +@pytest.mark.parametrize("enable_async_mode", [True, False], indirect=True) def test_run_resnet50_inference( device_mesh, use_program_cache, @@ -92,7 +125,7 @@ def test_run_resnet50_inference( act_dtype, weight_dtype, math_fidelity, - enable_async, + enable_async_mode, model_location_generator, ): if device_batch_size == 8: @@ -100,10 +133,6 @@ def test_run_resnet50_inference( if is_wormhole_b0() and device_batch_size == 20: pytest.skip("Skipping batch size 20 for Wormhole B0 due to fitting issue") - for device in device_mesh.get_device_ids(): - device_mesh.get_device(device).enable_async(enable_async) - device_mesh.get_device(device).enable_program_cache() - inputs_mesh_mapper = ttnn.ShardTensorToMesh(device_mesh, dim=0) weights_mesh_mapper = ttnn.ReplicateTensorToMesh(device_mesh) output_mesh_composer = ttnn.ConcatMeshToTensor(device_mesh, dim=0) @@ -146,10 +175,6 @@ def test_run_resnet50_inference( signpost(header="stop") test_infra.validate() - for device in device_mesh.get_device_ids(): - device_mesh.get_device(device).enable_async(False) - device_mesh.get_device(device).disable_and_clear_program_cache() - @run_for_wormhole_b0() @pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "trace_region_size": 800768}], indirect=True) @@ -157,7 +182,7 @@ def test_run_resnet50_inference( "device_batch_size, act_dtype, weight_dtype, math_fidelity", ((16, ttnn.bfloat8_b, ttnn.bfloat8_b, ttnn.MathFidelity.LoFi),), ) -@pytest.mark.parametrize("enable_async", [True, False]) +@pytest.mark.parametrize("enable_async_mode", [True, False], indirect=True) def test_run_resnet50_trace_inference( device_mesh, use_program_cache, @@ -165,7 +190,7 @@ def test_run_resnet50_trace_inference( act_dtype, weight_dtype, math_fidelity, - enable_async, + enable_async_mode, model_location_generator, ): if device_batch_size == 8: @@ -173,10 +198,6 @@ def test_run_resnet50_trace_inference( if is_wormhole_b0() and device_batch_size == 20: pytest.skip("Skipping batch size 20 for Wormhole B0 due to fitting issue") - for device in device_mesh.get_device_ids(): - device_mesh.get_device(device).enable_async(enable_async) - device_mesh.get_device(device).enable_program_cache() - inputs_mesh_mapper = ttnn.ShardTensorToMesh(device_mesh, dim=0) weights_mesh_mapper = ttnn.ReplicateTensorToMesh(device_mesh) output_mesh_composer = ttnn.ConcatMeshToTensor(device_mesh, dim=0) @@ -230,19 +251,14 @@ def test_run_resnet50_trace_inference( signpost(header="stop") test_infra.validate() - for device in device_mesh.get_device_ids(): - device_mesh.get_device(device).enable_async(False) - device_mesh.get_device(device).disable_and_clear_program_cache() - -@pytest.mark.skip() @run_for_wormhole_b0() -@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "num_hw_cqs": 2}], indirect=True) +@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "num_command_queues": 2}], indirect=True) @pytest.mark.parametrize( "device_batch_size, act_dtype, weight_dtype, math_fidelity", ((16, ttnn.bfloat8_b, ttnn.bfloat8_b, ttnn.MathFidelity.LoFi),), ) 
-@pytest.mark.parametrize("enable_async", [True, False]) +@pytest.mark.parametrize("enable_async_mode", [True, False], indirect=True) def test_run_resnet50_2cqs_inference( device_mesh, use_program_cache, @@ -250,16 +266,13 @@ def test_run_resnet50_2cqs_inference( act_dtype, weight_dtype, math_fidelity, - enable_async, + enable_async_mode, model_location_generator, ): if device_batch_size == 8: pytest.skip("Skipping batch size 8 due to memory config issue") if is_wormhole_b0() and device_batch_size == 20: pytest.skip("Skipping batch size 20 for Wormhole B0 due to fitting issue") - for device in device_mesh.get_device_ids(): - device_mesh.get_device(device).enable_async(enable_async) - device_mesh.get_device(device).enable_program_cache() inputs_mesh_mapper = ttnn.ShardTensorToMesh(device_mesh, dim=0) weights_mesh_mapper = ttnn.ReplicateTensorToMesh(device_mesh) @@ -287,8 +300,8 @@ def test_run_resnet50_2cqs_inference( output_mesh_composer, ) tt_image_res = tt_inputs_host.to(device_mesh, sharded_mem_config_DRAM) - op_event = ttnn.create_event() - write_event = ttnn.create_event() + op_event = ttnn.create_event(device_mesh) + write_event = ttnn.create_event(device_mesh) # Initialize the op event so we can write ttnn.record_event(device_mesh, 0, op_event) @@ -325,28 +338,23 @@ def test_run_resnet50_2cqs_inference( ttnn.record_event(device_mesh, 0, op_event) outputs.append(ttnn.from_device(test_infra.run(), blocking=False)) - ttnn.synchronize_devices(device) + ttnn.synchronize_devices(device_mesh) if use_signpost: signpost(header="stop") for output in outputs: test_infra.validate(output) - for device in device_mesh.get_device_ids(): - device_mesh.get_device(device).enable_async(False) - device_mesh.get_device(device).disable_and_clear_program_cache() - -@pytest.mark.skip() @run_for_wormhole_b0() @pytest.mark.parametrize( - "device_params", [{"l1_small_size": 24576, "trace_region_size": 800768, "num_hw_cqs": 2}], indirect=True + "device_params", [{"l1_small_size": 24576, "trace_region_size": 800768, "num_command_queues": 2}], indirect=True ) @pytest.mark.parametrize( "device_batch_size, act_dtype, weight_dtype, math_fidelity", ((16, ttnn.bfloat8_b, ttnn.bfloat8_b, ttnn.MathFidelity.LoFi),), ) -@pytest.mark.parametrize("enable_async", [True, False]) +@pytest.mark.parametrize("enable_async_mode", [True, False], indirect=True) def test_run_resnet50_trace_2cqs_inference( device_mesh, use_program_cache, @@ -354,16 +362,13 @@ def test_run_resnet50_trace_2cqs_inference( act_dtype, weight_dtype, math_fidelity, - enable_async, + enable_async_mode, model_location_generator, ): if device_batch_size == 8: pytest.skip("Skipping batch size 8 due to memory config issue") if is_wormhole_b0() and device_batch_size == 20: pytest.skip("Skipping batch size 20 for Wormhole B0 due to fitting issue") - for device in device_mesh.get_device_ids(): - device_mesh.get_device(device).enable_async(enable_async) - device_mesh.get_device(device).enable_program_cache() inputs_mesh_mapper = ttnn.ShardTensorToMesh(device_mesh, dim=0) weights_mesh_mapper = ttnn.ReplicateTensorToMesh(device_mesh) @@ -391,8 +396,8 @@ def test_run_resnet50_trace_2cqs_inference( output_mesh_composer, ) tt_image_res = tt_inputs_host.to(device_mesh, sharded_mem_config_DRAM) - op_event = ttnn.create_event() - write_event = ttnn.create_event() + op_event = ttnn.create_event(device_mesh) + write_event = ttnn.create_event(device_mesh) # Initialize the op event so we can write ttnn.record_event(device_mesh, 0, op_event) @@ -412,7 +417,7 @@ def 
test_run_resnet50_trace_2cqs_inference( ttnn.record_event(device_mesh, 1, write_event) ttnn.wait_for_event(device_mesh, 0, write_event) test_infra.input_tensor = ttnn.to_memory_config(tt_image_res, input_mem_config) - first_out_addr = test_infra.input_tensor.buffer_address() + first_out_addr = ttnn.buffer_address(test_infra.input_tensor) ttnn.record_event(device_mesh, 0, op_event) test_infra.run() test_infra.validate() @@ -434,7 +439,7 @@ def test_run_resnet50_trace_2cqs_inference( input_mem_config, ) ttnn.end_trace_capture(device_mesh, tid, cq_id=0) - assert first_out_addr == test_infra.input_tensor.buffer_address() + assert first_out_addr == ttnn.buffer_address(test_infra.input_tensor) test_infra.validate() # More optimized run with caching @@ -452,19 +457,10 @@ def test_run_resnet50_trace_2cqs_inference( ) ttnn.record_event(device_mesh, 0, op_event) ttnn.execute_trace(device_mesh, tid, cq_id=0, blocking=False) - outputs.append( - ttnn.from_device( - test_infra.output_tensor, device=device_mesh, mesh_composer=output_mesh_composer, blocking=False - ) - ) - + outputs.append(ttnn.from_device(test_infra.output_tensor, blocking=False)) ttnn.synchronize_devices(device_mesh) if use_signpost: signpost(header="stop") for output in outputs: test_infra.validate(output) - - for device in device_mesh.get_device_ids(): - device_mesh.get_device(device).enable_async(False) - device_mesh.get_device(device).disable_and_clear_program_cache() diff --git a/models/demos/ttnn_resnet/tests/test_perf_device_ttnn_resnet.py b/models/demos/ttnn_resnet/tests/test_perf_device_ttnn_resnet.py index 7a84e96d980..df7fcf64bf5 100644 --- a/models/demos/ttnn_resnet/tests/test_perf_device_ttnn_resnet.py +++ b/models/demos/ttnn_resnet/tests/test_perf_device_ttnn_resnet.py @@ -4,10 +4,10 @@ import pytest from models.perf.device_perf_utils import run_device_perf, check_device_perf, prep_device_perf_report -from models.utility_functions import skip_for_grayskull +from models.utility_functions import run_for_wormhole_b0 -@skip_for_grayskull(reason_str="Untested for Grayskull") +@run_for_wormhole_b0() @pytest.mark.models_device_performance_bare_metal @pytest.mark.parametrize( "batch_size, test, expected_perf", diff --git a/models/demos/ttnn_resnet/tests/test_perf_ttnn_resnet.py b/models/demos/ttnn_resnet/tests/test_perf_ttnn_resnet.py index a133e3c1314..ee533095d44 100644 --- a/models/demos/ttnn_resnet/tests/test_perf_ttnn_resnet.py +++ b/models/demos/ttnn_resnet/tests/test_perf_ttnn_resnet.py @@ -16,7 +16,7 @@ from models.utility_functions import ( profiler, disable_persistent_kernel_cache, - skip_for_grayskull, + run_for_wormhole_b0, ) from models.perf.perf_utils import prep_perf_report @@ -69,7 +69,7 @@ def run_model(device, tt_inputs, tt_resnet50, num_warmup_iterations, num_measure _ = ttnn.from_device(tt_resnet50(tt_inputs, device, ops_parallel_config), blocking=True) ttnn.dump_device_profiler(device) - ttnn.device.synchronize_device(device) + ttnn.synchronize_device(device) if use_signpost: signpost(header="start") outputs = [] @@ -77,7 +77,7 @@ def run_model(device, tt_inputs, tt_resnet50, num_warmup_iterations, num_measure for iter in range(0, num_measurement_iterations): tt_inputs = tt_inputs_host.to(device, input_mem_config) outputs.append(ttnn.from_device(tt_resnet50(tt_inputs, device, ops_parallel_config), blocking=False)) - ttnn.device.synchronize_device(device) + ttnn.synchronize_device(device) profiler.end(f"run") if use_signpost: signpost(header="stop") @@ -125,7 +125,7 @@ def run_2cq_model(device, tt_inputs, 
tt_resnet50, num_warmup_iterations, num_mea _ = ttnn.from_device(tt_resnet50(reshard_out, device, ops_parallel_config), blocking=True) ttnn.dump_device_profiler(device) - ttnn.device.synchronize_device(device) + ttnn.synchronize_device(device) if use_signpost: signpost(header="start") outputs = [] @@ -138,7 +138,7 @@ def run_2cq_model(device, tt_inputs, tt_resnet50, num_warmup_iterations, num_mea reshard_out = ttnn.to_memory_config(tt_image_res, input_mem_config) ttnn.record_event(device, 0, op_event) outputs.append(ttnn.from_device(tt_resnet50(reshard_out, device, ops_parallel_config), blocking=False)) - ttnn.device.synchronize_device(device) + ttnn.synchronize_device(device) profiler.end(f"run") if use_signpost: signpost(header="stop") @@ -178,7 +178,7 @@ def run_trace_model(device, tt_inputs, tt_resnet50, num_warmup_iterations, num_m _ = ttnn.from_device(tt_output_res, blocking=True) ttnn.dump_device_profiler(device) - ttnn.device.synchronize_device(device) + ttnn.synchronize_device(device) if use_signpost: signpost(header="start") outputs = [] @@ -187,7 +187,7 @@ def run_trace_model(device, tt_inputs, tt_resnet50, num_warmup_iterations, num_m ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res) ttnn.execute_trace(device, tid, cq_id=0, blocking=False) outputs.append(ttnn.from_device(tt_output_res, blocking=False)) - ttnn.device.synchronize_device(device) + ttnn.synchronize_device(device) profiler.end(f"run") if use_signpost: signpost(header="stop") @@ -255,7 +255,7 @@ def run_trace_2cq_model(device, tt_inputs, tt_resnet50, num_warmup_iterations, n ttnn.execute_trace(device, tid, cq_id=0, blocking=True) ttnn.dump_device_profiler(device) - ttnn.device.synchronize_device(device) + ttnn.synchronize_device(device) if use_signpost: signpost(header="start") outputs = [] @@ -270,7 +270,7 @@ def run_trace_2cq_model(device, tt_inputs, tt_resnet50, num_warmup_iterations, n ttnn.record_event(device, 0, op_event) ttnn.execute_trace(device, tid, cq_id=0, blocking=False) outputs.append(tt_output_res.cpu(blocking=False)) - ttnn.device.synchronize_device(device) + ttnn.synchronize_device(device) profiler.end(f"run") if use_signpost: signpost(header="stop") @@ -322,7 +322,7 @@ def run_perf_resnet( dealloc_input=True, final_output_mem_config=ttnn.DRAM_MEMORY_CONFIG if "trace" in model_version else ttnn.L1_MEMORY_CONFIG, ) - ttnn.device.synchronize_device(device) + ttnn.synchronize_device(device) num_warmup_iterations = 5 num_measurement_iterations = 15 @@ -366,9 +366,9 @@ def run_perf_resnet( logger.info(f"{model_name} compile time: {compile_time}") -@skip_for_grayskull(reason_str="Untested for Grayskull") -@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576}], indirect=True) +@run_for_wormhole_b0() @pytest.mark.models_performance_bare_metal +@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576}], indirect=True) @pytest.mark.parametrize( "batch_size, expected_inference_time, expected_compile_time", ((16, 0.006, 25),), @@ -393,15 +393,16 @@ def test_perf_bare_metal( ) -@skip_for_grayskull(reason_str="Untested for Grayskull") -@pytest.mark.parametrize("device_params", [{"l1_small_size": 32768, "trace_region_size": 1500000}], indirect=True) +@run_for_wormhole_b0() @pytest.mark.models_performance_bare_metal +@pytest.mark.parametrize("device_params", [{"l1_small_size": 32768, "trace_region_size": 1500000}], indirect=True) @pytest.mark.parametrize( - "batch_size, enable_async, expected_inference_time, expected_compile_time", + "batch_size, enable_async_mode, 
expected_inference_time, expected_compile_time", ( (16, True, 0.005, 25), (16, False, 0.0046, 25), ), + indirect=["enable_async_mode"], ) def test_perf_trace_bare_metal( device, @@ -410,11 +411,10 @@ def test_perf_trace_bare_metal( expected_inference_time, expected_compile_time, hf_cat_image_sample_input, - enable_async, + enable_async_mode, model_location_generator, ): - device.enable_async(enable_async) - mode = "async" if enable_async else "sync" + mode = "async" if enable_async_mode else "sync" run_perf_resnet( batch_size, expected_inference_time, @@ -424,12 +424,11 @@ def test_perf_trace_bare_metal( f"resnet50_trace_{mode}", model_location_generator, ) - device.enable_async(False) -@skip_for_grayskull(reason_str="Untested for Grayskull") -@pytest.mark.parametrize("device_params", [{"l1_small_size": 32768, "num_hw_cqs": 2}], indirect=True) +@run_for_wormhole_b0() @pytest.mark.models_performance_bare_metal +@pytest.mark.parametrize("device_params", [{"l1_small_size": 32768, "num_hw_cqs": 2}], indirect=True) @pytest.mark.parametrize( "batch_size, expected_inference_time, expected_compile_time", ((16, 0.0064, 25),), @@ -454,11 +453,11 @@ def test_perf_2cqs_bare_metal( ) -@skip_for_grayskull(reason_str="Untested for Grayskull") +@run_for_wormhole_b0() +@pytest.mark.models_performance_bare_metal @pytest.mark.parametrize( "device_params", [{"l1_small_size": 32768, "num_hw_cqs": 2, "trace_region_size": 1332224}], indirect=True ) -@pytest.mark.models_performance_bare_metal @pytest.mark.parametrize( "batch_size, expected_inference_time, expected_compile_time", ((16, 0.004, 25),), diff --git a/models/demos/ttnn_resnet/tests/test_ttnn_resnet50_performant.py b/models/demos/ttnn_resnet/tests/test_ttnn_resnet50_performant.py index 57bf4dd5833..bc1ffff7649 100644 --- a/models/demos/ttnn_resnet/tests/test_ttnn_resnet50_performant.py +++ b/models/demos/ttnn_resnet/tests/test_ttnn_resnet50_performant.py @@ -25,7 +25,6 @@ ttnn.create_event = tt_lib.device.CreateEvent ttnn.wait_for_event = tt_lib.device.WaitForEvent ttnn.record_event = tt_lib.device.RecordEvent -ttnn.dump_device_profiler = tt_lib.device.DumpDeviceProfiler # TODO: Move these into Resnet model preprocessing/member functions @@ -249,7 +248,7 @@ def test_run_resnet50_2cqs_inference( test_infra.input_tensor = ttnn.to_memory_config(tt_image_res, input_mem_config) ttnn.record_event(device, 0, op_event) outputs.append(ttnn.from_device(test_infra.run(), blocking=False)) - ttnn.device.synchronize_device(device) + ttnn.synchronize_device(device) if use_signpost: signpost(header="stop") for output in outputs: @@ -347,7 +346,7 @@ def test_run_resnet50_trace_2cqs_inference( if use_signpost: signpost(header="start") outputs = [] - for iter in range(0, 1): + for iter in range(0, 2): ttnn.wait_for_event(device, 1, op_event) ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) ttnn.record_event(device, 1, write_event) @@ -360,7 +359,7 @@ def test_run_resnet50_trace_2cqs_inference( ttnn.execute_trace(device, tid, cq_id=0, blocking=False) outputs.append(ttnn.from_device(test_infra.output_tensor, blocking=False)) - ttnn.device.synchronize_device(device) + ttnn.synchronize_device(device) if use_signpost: signpost(header="stop") for output in outputs: diff --git a/models/demos/ttnn_resnet/tests/ttnn_resnet_test_infra.py b/models/demos/ttnn_resnet/tests/ttnn_resnet_test_infra.py index b83f3cfed31..9aa74461298 100644 --- a/models/demos/ttnn_resnet/tests/ttnn_resnet_test_infra.py +++ b/models/demos/ttnn_resnet/tests/ttnn_resnet_test_infra.py @@ -113,7 
+113,7 @@ def load_resnet50_model(model_location_generator): ttnn.MathFidelity.LoFi, ttnn.bfloat8_b, ttnn.bfloat8_b, - ): 0.884609, # Max ATOL Delta: 6.455164909362793, Max RTOL Delta: inf, PCC: 0.8846098380419435 + ): 0.988, # Max ATOL Delta: 6.455164909362793, Max RTOL Delta: inf, PCC: 0.8846098380419435 }, 20: { ( @@ -135,6 +135,19 @@ def load_resnet50_model(model_location_generator): }, } +golden_pcc = { + ttnn.device.Arch.WORMHOLE_B0: golden_pcc, + ttnn.device.Arch.GRAYSKULL: golden_pcc, +} + +golden_pcc[ttnn.device.Arch.GRAYSKULL][16][ + ( + ttnn.MathFidelity.LoFi, + ttnn.bfloat8_b, + ttnn.bfloat8_b, + ) +] = 0.936 + class ResNet50TestInfra: def __init__( @@ -227,7 +240,9 @@ def validate(self, output_tensor=None): valid_pcc = 1.0 if self.batch_size >= 8: - valid_pcc = golden_pcc[self.batch_size][(self.math_fidelity, self.weight_dtype, self.act_dtype)] + valid_pcc = golden_pcc[self.device.arch()][self.batch_size][ + (self.math_fidelity, self.weight_dtype, self.act_dtype) + ] else: if self.act_dtype == ttnn.bfloat8_b: if self.math_fidelity == ttnn.MathFidelity.LoFi: @@ -239,7 +254,6 @@ def validate(self, output_tensor=None): valid_pcc = 0.93 else: valid_pcc = 0.982 - print(valid_pcc) self.pcc_passed, self.pcc_message = assert_with_pcc(self.torch_output_tensor, output_tensor, pcc=valid_pcc) logger.info( diff --git a/models/perf/perf_utils.py b/models/perf/perf_utils.py index 92cc1cc9bf5..3f8e4edfc32 100644 --- a/models/perf/perf_utils.py +++ b/models/perf/perf_utils.py @@ -45,7 +45,7 @@ def merge_perf_files(fname, perf_fname, expected_cols): def process_perf_results(fname, expected_cols): with open(fname) as file: - merge_res = csv.reader(file) + merge_res = csv.reader(file, skipinitialspace=True) logger.info(next(merge_res)[0].strip()) logger.info(next(merge_res)[0].strip()) cols = next(merge_res) diff --git a/tests/scripts/t3000/run_t3000_frequent_tests.sh b/tests/scripts/t3000/run_t3000_frequent_tests.sh index 3ff50232deb..b96a360ce86 100755 --- a/tests/scripts/t3000/run_t3000_frequent_tests.sh +++ b/tests/scripts/t3000/run_t3000_frequent_tests.sh @@ -124,6 +124,24 @@ run_t3000_falcon40b_tests() { fi } +run_t3000_resnet_tests() { + fail=0 + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_resnet_tests" + + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/ttnn_resnet/tests/multi_device/test_ttnn_resnet50_performant.py ; fail+=$? + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_resnet_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi +} + run_t3000_tests() { # Run ethernet tests run_t3000_ethernet_tests @@ -143,6 +161,9 @@ run_t3000_tests() { # Run mixtral tests run_t3000_mixtral_tests + # Run resnet tests + run_t3000_resnet_tests + # Run trace tests run_t3000_trace_stress_tests diff --git a/tests/scripts/t3000/run_t3000_model_perf_tests.sh b/tests/scripts/t3000/run_t3000_model_perf_tests.sh index 4af78368f90..27b5a0c3ce4 100755 --- a/tests/scripts/t3000/run_t3000_model_perf_tests.sh +++ b/tests/scripts/t3000/run_t3000_model_perf_tests.sh @@ -74,6 +74,24 @@ run_t3000_falcon40b_tests() { fi } +run_t3000_resnet50_tests() { + # Record the start time + fail=0 + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_resnet50_tests" + + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/ttnn_resnet/tests/multi_device/test_perf_ttnn_resnet.py -m "model_perf_t3000" ; fail+=$? 
+ + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_resnet50_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi +} + run_t3000_llm_tests() { # Run falcon7b tests run_t3000_falcon7b_tests @@ -92,6 +110,9 @@ run_t3000_llm_tests() { } run_t3000_cnn_tests() { + # Run resnet50 tests + run_t3000_resnet50_tests + # Merge all the generated reports env python models/perf/merge_perf_results.py } diff --git a/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/sharded/sharded_op.hpp b/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/sharded/sharded_op.hpp index c8830d081ee..27b77624921 100644 --- a/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/sharded/sharded_op.hpp +++ b/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/sharded/sharded_op.hpp @@ -192,9 +192,9 @@ struct Reshard { inline Tensor reshard(const Tensor &input_tensor, const MemoryConfig &output_mem_config, std::optional output_tensor = std::nullopt) { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; operation::launch_op( - [output_mem_config, output_tensor] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { + [output_mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { const auto& input_tensor = input_tensors.at(0); - return operation::run(Reshard{.output_mem_config = output_mem_config,}, {input_tensor}, {}, {output_tensor}); + return operation::run(Reshard{.output_mem_config = output_mem_config,}, {input_tensor}, {}, {optional_output_tensors}); }, {input_tensor}, output_tensors, {}, {output_tensor}); return output_tensors.at(0); }