From ec0cc14c288af65a93888da7975edaf3b4c8e7ba Mon Sep 17 00:00:00 2001 From: Austin Ho <109362939+tt-aho@users.noreply.github.com> Date: Mon, 5 Aug 2024 13:29:46 -0400 Subject: [PATCH] Enable T3K Resnet Tests (#11030) * #10244: Fix optional output tensor handling for reshard * #0: Add enable_async_mode device fixture and refactor use_program_cache * #0: Cleanup ttnn_resnet single device test files * #10244: Fix multi-device api issues for ttnn resnet tests, and add them to ci * #10244: Add E2E performance tests for ttnn_resnet on t3000 * #10244: Add t3000 perf results for ttnn_resnet to README * #0: Remove initial space when parsing perf csv * #0: Increase timeout for Nightly N300 WH-only models job due to some ci machines being slower than others --- ...-dispatch-full-regressions-and-models.yaml | 2 +- .github/workflows/t3000-frequent-tests.yaml | 16 +- .github/workflows/t3000-model-perf-tests.yaml | 2 + README.md | 16 +- conftest.py | 49 +- models/demos/ttnn_resnet/README.md | 5 + .../multi_device/test_perf_ttnn_resnet.py | 599 ++++++++++++++++++ .../test_ttnn_resnet50_performant.py | 114 ++-- .../tests/test_perf_device_ttnn_resnet.py | 4 +- .../tests/test_perf_ttnn_resnet.py | 45 +- .../tests/test_ttnn_resnet50_performant.py | 7 +- .../tests/ttnn_resnet_test_infra.py | 20 +- models/perf/perf_utils.py | 2 +- .../scripts/t3000/run_t3000_frequent_tests.sh | 21 + .../t3000/run_t3000_model_perf_tests.sh | 21 + .../tt_dnn/op_library/sharded/sharded_op.hpp | 4 +- 16 files changed, 798 insertions(+), 129 deletions(-) create mode 100644 models/demos/ttnn_resnet/tests/multi_device/test_perf_ttnn_resnet.py diff --git a/.github/workflows/fast-dispatch-full-regressions-and-models.yaml b/.github/workflows/fast-dispatch-full-regressions-and-models.yaml index b2b9ebb3cf1..796065f4844 100644 --- a/.github/workflows/fast-dispatch-full-regressions-and-models.yaml +++ b/.github/workflows/fast-dispatch-full-regressions-and-models.yaml @@ -23,7 +23,7 @@ jobs: { name: "Common models N300 WH B0", arch: wormhole_b0, cmd: tests/scripts/single_card/nightly/run_common_models.sh, timeout: 40 }, { name: "GS-only ttnn nightly", arch: grayskull, cmd: tests/scripts/single_card/nightly/run_ttnn.sh, timeout: 40 }, { name: "GS-only models", arch: grayskull, cmd: tests/scripts/single_card/nightly/run_gs_only.sh, timeout: 40 }, - { name: "N300 WH-only models", arch: wormhole_b0, cmd: tests/scripts/single_card/nightly/run_wh_b0_only.sh, timeout: 40 }, + { name: "N300 WH-only models", arch: wormhole_b0, cmd: tests/scripts/single_card/nightly/run_wh_b0_only.sh, timeout: 50 }, { name: "API tests GS", arch: grayskull, cmd: ./tests/scripts/run_tests.sh --tt-arch grayskull --pipeline-type frequent_api --dispatch-mode fast, timeout: 40 }, { name: "API tests N300 WH B0", arch: wormhole_b0, cmd: ./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_api --dispatch-mode fast, timeout: 40 }, # #9945: Skip SD for now diff --git a/.github/workflows/t3000-frequent-tests.yaml b/.github/workflows/t3000-frequent-tests.yaml index b385aa9da51..2e53c1de2ca 100644 --- a/.github/workflows/t3000-frequent-tests.yaml +++ b/.github/workflows/t3000-frequent-tests.yaml @@ -17,18 +17,20 @@ jobs: fail-fast: false matrix: test-group: [ - { name: "t3k tteager tests", arch: wormhole_b0, cmd: run_t3000_tteager_tests, timeout: 60, + { name: "t3k tteager tests", arch: wormhole_b0, cmd: run_t3000_tteager_tests, timeout: 60, runs-on: ["config-t3000", "in-service", "pipeline-functional"], owner_id: ULMEPM2MA}, #Sean Nijjar - { name: "t3k 
ethernet tests", arch: wormhole_b0, cmd: run_t3000_ethernet_tests, timeout: 60, + { name: "t3k ethernet tests", arch: wormhole_b0, cmd: run_t3000_ethernet_tests, timeout: 60, runs-on: ["config-t3000", "in-service", "pipeline-functional"], owner_id: ULMEPM2MA}, #Sean Nijjar - { name: "t3k trace stress tests", arch: wormhole_b0, cmd: run_t3000_trace_stress_tests, timeout: 120, + { name: "t3k trace stress tests", arch: wormhole_b0, cmd: run_t3000_trace_stress_tests, timeout: 120, runs-on: ["config-t3000", "in-service", "pipeline-functional"], owner_id: U03NG0A5ND7}, #Aditya Saigal - { name: "t3k falcon40b tests", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 120, - runs-on: ["config-t3000", "in-service", "pipeline-functional"], owner_id: U04S2UV6L8N}, #Sofija Jovic - { name: "t3k llama2_70b tests", arch: wormhole_b0, cmd: run_t3000_llama2_70b_tests, timeout: 60, + { name: "t3k falcon40b tests", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 120, + runs-on: ["config-t3000", "in-service", "pipeline-functional"], owner_id: U04S2UV6L8N}, #Sofija Jovic + { name: "t3k llama2_70b tests", arch: wormhole_b0, cmd: run_t3000_llama2_70b_tests, timeout: 60, runs-on: ["config-t3000", "in-service", "pipeline-functional"], owner_id: U03FJB5TM5Y}, #Colman Glagovich - { name: "t3k mixtral tests", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 60, + { name: "t3k mixtral tests", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 60, runs-on: ["config-t3000", "in-service", "pipeline-functional"], owner_id: U03PUAKE719}, #Miguel Tairum Cruz + { name: "t3k resnet tests", arch: wormhole_b0, cmd: run_t3000_resnet_tests, timeout: 30, + runs-on: ["config-t3000", "in-service", "pipeline-functional"], owner_id: U013121KDH9}, #Austin Ho ] name: ${{ matrix.test-group.name }} env: diff --git a/.github/workflows/t3000-model-perf-tests.yaml b/.github/workflows/t3000-model-perf-tests.yaml index f6f5cae00a4..17599d68240 100644 --- a/.github/workflows/t3000-model-perf-tests.yaml +++ b/.github/workflows/t3000-model-perf-tests.yaml @@ -25,6 +25,8 @@ jobs: runs-on: ["arch-wormhole_b0", "config-t3000", "in-service", "pipeline-perf"], owner_id: U03FJB5TM5Y}, # Colman Glagovich { name: "t3k LLM falcon40b model perf tests", model: "falcon40b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 75, runs-on: ["arch-wormhole_b0", "config-t3000", "in-service", "pipeline-perf"], owner_id: U053W15B6JF}, # Djordje Ivanovic + { name: "t3k LLM resnet50 model perf tests", model: "resnet50", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_resnet50_tests, timeout: 75, + runs-on: ["arch-wormhole_b0", "config-t3000", "in-service", "pipeline-perf"], owner_id: U013121KDH9}, # Austin Ho #{ name: "t3k CNN model perf tests ", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_cnn_tests, timeout: 120, owner_id: }, #No tests are being run? ] name: ${{ matrix.test-group.name }} diff --git a/README.md b/README.md index e366f098ac0..d9984b58fcc 100644 --- a/README.md +++ b/README.md @@ -44,12 +44,12 @@ > > Furthermore, all performance numbers here are run or based off an N300 Wormhole card. -| Model | Last Verified Release | Gen. Token [3] | Batch | End-to-end throughput [1] | Device throughput [2] | Target | +| Model | Last Verified Release | Gen. 
Token [3] | Batch | End-to-end throughput [1] | Device throughput [2] | Target | |----------------------------------------------------------------------------------------|---------------------------------------------------------------------------|--------------------|----------------------|--------------------------------|------------------------------|----------------| -| [Falcon7B](./models/demos/wormhole/falcon7b) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | 129th | 32 | 13.7 t/s/u - 438 t/s | 19.5 t/s/u - 624 t/s | 26 | -| [Mistral-7B](./models/demos/wormhole/mistral7b) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | 129th | 32 | 9.9 t/s/u - 317 t/s | 11.0 t/s/u - 352 t/s | 25 | +| [Falcon7B](./models/demos/wormhole/falcon7b) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | 129th | 32 | 13.7 t/s/u - 438 t/s | 19.5 t/s/u - 624 t/s | 26 | +| [Mistral-7B](./models/demos/wormhole/mistral7b) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | 129th | 32 | 9.9 t/s/u - 317 t/s | 11.0 t/s/u - 352 t/s | 25 | | [Mamba-2.8B](./models/demos/wormhole/mamba) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | any | 32 | 11.6 t/s/u - 371 t/s | 16.5 t/s/u - 528 t/s | 41 | -| [LLaMA-3.1-8B](./models/demos/wormhole/llama31_8b) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | 129th | 8 | 8.3 t/s/u - 66.0 t/s | 9.7 t/s/u - 77.9 t/s | 23 | +| [LLaMA-3.1-8B](./models/demos/wormhole/llama31_8b) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | 129th | 8 | 8.3 t/s/u - 66.0 t/s | 9.7 t/s/u - 77.9 t/s | 23 | | [BERT-Large](./models/demos/metal_BERT_large_11/) (sen/s) [4] | | | 8 | 270 | 340 | 400 | | [Stable Diffusion 1.4](./models/demos/wormhole/stable_diffusion) 512x512 (sec/img) [5] | | | 1 | 6 | 5 | 3 | | [ResNet-50](./models/demos/ttnn_resnet) (fps) | | | 16 | 4,300 | 5,550 | 7,000 | @@ -66,14 +66,14 @@ ## TT-QuietBox & TT-LoudBox (2x4 mesh of WHs) Models -| Model | Last Verified Release | Technique | Gen. Token [3] | Batch | End-to-end throughput [1] | Device throughput [2] | Target | +| Model | Last Verified Release | Technique | Gen. 
Token [3] | Batch | End-to-end throughput [1] | Device throughput [2] | Target | |----------------------------------------------------|---------------------------------------------------------------------------|--------------------|---------------------|-----------------------|------------------------------|------------------------------|-----------------| -| [Falcon7B](./models/demos/t3000/falcon7b) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | Data Parallel | 129th | 256 | 7.6 t/s/u - 1950 t/s | 19.5 t/s/u - 4990 t/s | 26 t/s/u | +| [Falcon7B](./models/demos/t3000/falcon7b) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | Data Parallel | 129th | 256 | 7.6 t/s/u - 1950 t/s | 19.5 t/s/u - 4990 t/s | 26 t/s/u | | [LLaMA-2-70B](./models/demos/t3000/llama2_70b) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | Tensor Parallel | 129th | 32 | 10.4 t/s/u - 333 t/s | 16.6 t/s/u - 531 t/s | 20 t/s/u | | [LLaMA-3.1-70B](./models/demos/t3000/llama3_70b) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | Tensor Parallel | 129th | 32 | 10.4 t/s/u - 333 t/s | 15.8 t/s/u - 506 t/s | 20 t/s/u | -| [Falcon40B](./models/demos/t3000/falcon40b) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | Tensor Parallel | 129th | 32 | 5.3 t/s/u - 168 t/s | 12.2 t/s/u - 390 t/s | 36 t/s/u | +| [Falcon40B](./models/demos/t3000/falcon40b) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | Tensor Parallel | 129th | 32 | 5.3 t/s/u - 168 t/s | 12.2 t/s/u - 390 t/s | 36 t/s/u | | [Mixtral7Bx8](./models/demos/t3000/mixtral8x7b) | [v0.51.0-rc13](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc13) | Tensor Parallel | 129th | 32 | 13.3 t/s/u - 426 t/s | 21.4 t/s/u - 685 t/s | 33 t/s/u | -| ResNet50 | | Data Parallel | coming soon | | | | | +| [ResNet-50](./models/demos/ttnn_resnet) | | Data Parallel | | 128 | 31,700 | 44,400 | 56,000 | ## Model Updates For the latest model updates and features, please see [MODEL_UPDATES.md](models/MODEL_UPDATES.md) diff --git a/conftest.py b/conftest.py index 597ae033b8f..06aa9ccfd4c 100644 --- a/conftest.py +++ b/conftest.py @@ -271,36 +271,47 @@ def reset_default_device(): ttl.device.SetDefaultDevice(device) -@pytest.fixture(scope="function") -def use_program_cache(request): - import tt_lib as ttl - +def get_devices(request): if "device" in request.fixturenames: - dev = request.getfixturevalue("device") - dev.enable_program_cache() + devices = [request.getfixturevalue("device")] elif "all_devices" in request.fixturenames: devices = request.getfixturevalue("all_devices") - for dev in devices: - dev.enable_program_cache() elif "pcie_devices" in request.fixturenames: devices = request.getfixturevalue("pcie_devices") - for dev in devices: - dev.enable_program_cache() elif "device_mesh" in request.fixturenames: - mesh = request.getfixturevalue("device_mesh") - for device_id in mesh.get_device_ids(): - mesh.get_device(device_id).enable_program_cache() + devices = request.getfixturevalue("device_mesh").get_devices() elif "t3k_device_mesh" in request.fixturenames: - mesh = request.getfixturevalue("t3k_device_mesh") - for device_id in mesh.get_device_ids(): - mesh.get_device(device_id).enable_program_cache() + devices = request.getfixturevalue("t3k_device_mesh").get_devices() elif "pcie_device_mesh" in request.fixturenames: - mesh = request.getfixturevalue("pcie_device_mesh") - for device_id in mesh.get_device_ids(): - 
mesh.get_device(device_id).enable_program_cache() + devices = request.getfixturevalue("pcie_device_mesh").get_devices() else: + devices = [] + return devices + + +@pytest.fixture(scope="function") +def use_program_cache(request): + devices = get_devices(request) + if not devices: logger.warning("No device fixture found to apply program cache to: PROGRAM CACHE DISABLED") + for dev in devices: + dev.enable_program_cache() yield + for dev in devices: + dev.disable_and_clear_program_cache() + + +@pytest.fixture(scope="function") +def enable_async_mode(request): + devices = get_devices(request) + if not devices: + logger.warning("No device fixture found to apply async mode to: ASYNC MODE DISABLED") + + for dev in devices: + dev.enable_async(request.param) + yield request.param + for dev in devices: + dev.enable_async(False) @pytest.fixture(scope="function") diff --git a/models/demos/ttnn_resnet/README.md b/models/demos/ttnn_resnet/README.md index 751f9f7d115..d14673ff3e2 100644 --- a/models/demos/ttnn_resnet/README.md +++ b/models/demos/ttnn_resnet/README.md @@ -8,8 +8,13 @@ Our ImageProcessor on the other hand is based on `microsoft/resnet-50` from hugg ## Performance +### Single Device + To obtain device performance, run `WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml ./tt_metal/tools/profiler/profile_this.py -c "pytest models/demos/ttnn_resnet/tests/test_ttnn_resnet50_performant.py::test_run_resnet50_inference[16-act_dtype0-weight_dtype0-math_fidelity0-device_params0]"` This will generate a CSV report under `/generated/profiler/reports/ops/`. The report file name is logged in the run output. + For end-to-end performance, run `WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/ttnn_resnet/tests/test_perf_ttnn_resnet.py::test_perf_trace_2cqs_bare_metal[16-0.004-25-device_params0]`. This will generate a CSV with the timings and throughputs. Expected end-to-end perf: For batch = 16, it is about `4300 fps` currently. This may vary machine to machine. + +### T3000 ++ For end-to-end performance, run `WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest test_perf_trace_2cqs_t3000[wormhole_b0-True-16-True-0.0043-60-device_params0]`. This will generate a CSV with the timings and throughputs. +Expected end-to-end perf: For batch = 16 per device, or batch 128 in total, it is about `31,700 fps` currently. This may vary machine to machine. diff --git a/models/demos/ttnn_resnet/tests/multi_device/test_perf_ttnn_resnet.py b/models/demos/ttnn_resnet/tests/multi_device/test_perf_ttnn_resnet.py new file mode 100644 index 00000000000..72e08d03921 --- /dev/null +++ b/models/demos/ttnn_resnet/tests/multi_device/test_perf_ttnn_resnet.py @@ -0,0 +1,599 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 + +import torch +from loguru import logger +from transformers import AutoImageProcessor +import pytest +import ttnn +import tt_lib +from ttnn.model_preprocessing import ( + preprocess_model_parameters, +) + +from models.utility_functions import ( + profiler, + disable_persistent_kernel_cache, + run_for_wormhole_b0, +) + +from models.perf.perf_utils import prep_perf_report + +from models.demos.ttnn_resnet.tests.multi_device.test_ttnn_resnet50_performant import ( + setup_l1_sharded_input, + setup_dram_sharded_input, +) +from models.demos.ttnn_resnet.tests.ttnn_resnet_test_infra import load_resnet50_model +from models.demos.ttnn_resnet.tt.custom_preprocessing import create_custom_mesh_preprocessor +from models.demos.ttnn_resnet.tt.ttnn_functional_resnet50_new_conv_api import resnet50 + +try: + from tracy import signpost + + use_signpost = True +except ModuleNotFoundError: + use_signpost = False + + +def create_event(device): + event = [] + if isinstance(device, ttnn.Device): + event.append(tt_lib.device.CreateEvent()) + else: + for dev in device.get_device_ids(): + event.append(tt_lib.device.CreateEvent()) + return event + + +def wait_for_event(device, cq_id, event): + if isinstance(device, ttnn.Device): + tt_lib.device.WaitForEvent(device, cq_id, event) + else: + for dev, eve in zip(device.get_device_ids(), event): + tt_lib.device.WaitForEvent(device.get_device(dev), cq_id, eve) + + +def record_event(device, cq_id, event): + if isinstance(device, ttnn.Device): + tt_lib.device.RecordEvent(device, cq_id, event) + else: + for dev, eve in zip(device.get_device_ids(), event): + tt_lib.device.RecordEvent(device.get_device(dev), cq_id, eve) + + +def buffer_address(tensor): + addr = [] + for ten in ttnn.get_device_tensors(tensor): + addr.append(ten.buffer_address()) + return addr + + +def dump_device_profiler(device): + if isinstance(device, ttnn.Device): + tt_lib.device.DumpDeviceProfiler(device) + else: + for dev in device.get_device_ids(): + tt_lib.device.DumpDeviceProfiler(device.get_device(dev)) + + +# TODO: Create ttnn apis for these +ttnn.create_event = create_event +ttnn.wait_for_event = wait_for_event +ttnn.record_event = record_event +ttnn.buffer_address = buffer_address +ttnn.dump_device_profiler = dump_device_profiler + +model_config = { + "MATH_FIDELITY": ttnn.MathFidelity.LoFi, + "WEIGHTS_DTYPE": ttnn.bfloat8_b, + "ACTIVATIONS_DTYPE": ttnn.bfloat8_b, +} + + +def run_model( + device, tt_inputs, tt_resnet50, mesh_mapper, mesh_composer, num_warmup_iterations, num_measurement_iterations +): + ops_parallel_config = {} + tt_inputs_host, input_mem_config = setup_l1_sharded_input( + device, tt_inputs, tt_resnet50, mesh_mapper, mesh_composer + ) + profiler.start("compile") + tt_inputs = tt_inputs_host.to(device, input_mem_config) + _ = ttnn.from_device(tt_resnet50(tt_inputs, device, ops_parallel_config), blocking=True) + profiler.end("compile") + ttnn.dump_device_profiler(device) + + profiler.start("cache") + tt_inputs = tt_inputs_host.to(device, input_mem_config) + _ = ttnn.from_device(tt_resnet50(tt_inputs, device, ops_parallel_config), blocking=True) + profiler.end("cache") + ttnn.dump_device_profiler(device) + + for iter in range(0, num_warmup_iterations): + tt_inputs = tt_inputs_host.to(device, input_mem_config) + _ = ttnn.from_device(tt_resnet50(tt_inputs, device, ops_parallel_config), blocking=True) + ttnn.dump_device_profiler(device) + + ttnn.synchronize_devices(device) + if use_signpost: + signpost(header="start") + outputs = [] + 
profiler.start(f"run") + for iter in range(0, num_measurement_iterations): + tt_inputs = tt_inputs_host.to(device, input_mem_config) + outputs.append(ttnn.from_device(tt_resnet50(tt_inputs, device, ops_parallel_config), blocking=False)) + ttnn.synchronize_devices(device) + profiler.end(f"run") + if use_signpost: + signpost(header="stop") + ttnn.dump_device_profiler(device) + + +def run_2cq_model( + device, tt_inputs, tt_resnet50, mesh_mapper, mesh_composer, num_warmup_iterations, num_measurement_iterations +): + ops_parallel_config = {} + tt_inputs_host, sharded_mem_config_DRAM, input_mem_config = setup_dram_sharded_input( + device, tt_inputs, tt_resnet50, mesh_mapper, mesh_composer + ) + tt_image_res = tt_inputs_host.to(device, sharded_mem_config_DRAM) + op_event = ttnn.create_event(device) + write_event = ttnn.create_event(device) + # Initialize the op event so we can write + ttnn.record_event(device, 0, op_event) + + profiler.start("compile") + ttnn.wait_for_event(device, 1, op_event) + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) + ttnn.record_event(device, 1, write_event) + ttnn.wait_for_event(device, 0, write_event) + reshard_out = ttnn.to_memory_config(tt_image_res, input_mem_config) + ttnn.record_event(device, 0, op_event) + _ = ttnn.from_device(tt_resnet50(reshard_out, device, ops_parallel_config), blocking=True) + profiler.end("compile") + ttnn.dump_device_profiler(device) + + profiler.start("cache") + ttnn.wait_for_event(device, 1, op_event) + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) + ttnn.record_event(device, 1, write_event) + ttnn.wait_for_event(device, 0, write_event) + reshard_out = ttnn.to_memory_config(tt_image_res, input_mem_config) + ttnn.record_event(device, 0, op_event) + _ = ttnn.from_device(tt_resnet50(reshard_out, device, ops_parallel_config), blocking=True) + profiler.end("cache") + ttnn.dump_device_profiler(device) + + for iter in range(0, num_warmup_iterations): + ttnn.wait_for_event(device, 1, op_event) + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) + ttnn.record_event(device, 1, write_event) + ttnn.wait_for_event(device, 0, write_event) + reshard_out = ttnn.to_memory_config(tt_image_res, input_mem_config) + ttnn.record_event(device, 0, op_event) + _ = ttnn.from_device(tt_resnet50(reshard_out, device, ops_parallel_config), blocking=True) + ttnn.dump_device_profiler(device) + + ttnn.synchronize_devices(device) + if use_signpost: + signpost(header="start") + outputs = [] + profiler.start(f"run") + for iter in range(0, num_measurement_iterations): + ttnn.wait_for_event(device, 1, op_event) + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) + ttnn.record_event(device, 1, write_event) + ttnn.wait_for_event(device, 0, write_event) + reshard_out = ttnn.to_memory_config(tt_image_res, input_mem_config) + ttnn.record_event(device, 0, op_event) + outputs.append(ttnn.from_device(tt_resnet50(reshard_out, device, ops_parallel_config), blocking=False)) + ttnn.synchronize_devices(device) + profiler.end(f"run") + if use_signpost: + signpost(header="stop") + ttnn.dump_device_profiler(device) + + +def run_trace_model( + device, tt_inputs, tt_resnet50, mesh_mapper, mesh_composer, num_warmup_iterations, num_measurement_iterations +): + ops_parallel_config = {} + tt_inputs_host, sharded_mem_config_DRAM, input_mem_config = setup_dram_sharded_input( + device, tt_inputs, tt_resnet50, mesh_mapper, mesh_composer + ) + tt_image_res = tt_inputs_host.to(device, sharded_mem_config_DRAM) + # Compile + 
profiler.start("compile") + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res) + reshard_out = ttnn.to_memory_config(tt_image_res, input_mem_config) + _ = ttnn.from_device(tt_resnet50(reshard_out, device, ops_parallel_config), blocking=True) + profiler.end("compile") + ttnn.dump_device_profiler(device) + + profiler.start("cache") + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res) + reshard_out = ttnn.to_memory_config(tt_image_res, input_mem_config) + _ = ttnn.from_device(tt_resnet50(reshard_out, device, ops_parallel_config), blocking=True) + profiler.end("cache") + ttnn.dump_device_profiler(device) + + # Capture + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res) + tid = ttnn.begin_trace_capture(device, cq_id=0) + reshard_out = ttnn.to_memory_config(tt_image_res, input_mem_config) + tt_output_res = tt_resnet50(reshard_out, device, ops_parallel_config) + ttnn.end_trace_capture(device, tid, cq_id=0) + ttnn.dump_device_profiler(device) + + for iter in range(0, num_warmup_iterations): + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res) + ttnn.execute_trace(device, tid, cq_id=0, blocking=False) + _ = ttnn.from_device(tt_output_res, blocking=True) + ttnn.dump_device_profiler(device) + + ttnn.synchronize_devices(device) + if use_signpost: + signpost(header="start") + outputs = [] + profiler.start(f"run") + for iter in range(0, num_measurement_iterations): + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res) + ttnn.execute_trace(device, tid, cq_id=0, blocking=False) + outputs.append(ttnn.from_device(tt_output_res, blocking=False)) + ttnn.synchronize_devices(device) + profiler.end(f"run") + if use_signpost: + signpost(header="stop") + ttnn.dump_device_profiler(device) + + +def run_trace_2cq_model( + device, tt_inputs, tt_resnet50, mesh_mapper, mesh_composer, num_warmup_iterations, num_measurement_iterations +): + ops_parallel_config = {} + tt_inputs_host, sharded_mem_config_DRAM, input_mem_config = setup_dram_sharded_input( + device, tt_inputs, tt_resnet50, mesh_mapper, mesh_composer + ) + tt_image_res = tt_inputs_host.to(device, sharded_mem_config_DRAM) + + op_event = ttnn.create_event(device) + write_event = ttnn.create_event(device) + # Initialize the op event so we can write + ttnn.record_event(device, 0, op_event) + + profiler.start("compile") + ttnn.wait_for_event(device, 1, op_event) + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) + ttnn.record_event(device, 1, write_event) + ttnn.wait_for_event(device, 0, write_event) + reshard_out = ttnn.to_memory_config(tt_image_res, input_mem_config) + ttnn.record_event(device, 0, op_event) + _ = ttnn.from_device(tt_resnet50(reshard_out, device, ops_parallel_config), blocking=True) + profiler.end("compile") + ttnn.dump_device_profiler(device) + + profiler.start("cache") + ttnn.wait_for_event(device, 1, op_event) + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) + ttnn.record_event(device, 1, write_event) + ttnn.wait_for_event(device, 0, write_event) + reshard_out = ttnn.to_memory_config(tt_image_res, input_mem_config) + first_out_addr = ttnn.buffer_address(reshard_out) + ttnn.record_event(device, 0, op_event) + _ = ttnn.from_device(tt_resnet50(reshard_out, device, ops_parallel_config), blocking=True) + profiler.end("cache") + ttnn.dump_device_profiler(device) + + # Capture + ttnn.wait_for_event(device, 1, op_event) + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) + ttnn.record_event(device, 1, write_event) + + ttnn.wait_for_event(device, 0, 
write_event) + reshard_out = ttnn.to_memory_config(tt_image_res, input_mem_config) + ttnn.record_event(device, 0, op_event) + + tid = ttnn.begin_trace_capture(device, cq_id=0) + tt_output_res = tt_resnet50(reshard_out, device, ops_parallel_config) + reshard_out = ttnn.allocate_tensor_on_device( + reshard_out.shape, reshard_out.dtype, reshard_out.layout, device, input_mem_config + ) + ttnn.end_trace_capture(device, tid, cq_id=0) + assert first_out_addr == ttnn.buffer_address(reshard_out) + ttnn.dump_device_profiler(device) + + for iter in range(0, num_warmup_iterations): + ttnn.wait_for_event(device, 1, op_event) + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) + ttnn.record_event(device, 1, write_event) + ttnn.wait_for_event(device, 0, write_event) + reshard_out = ttnn.experimental.tensor.reshard(tt_image_res, input_mem_config, reshard_out) + ttnn.record_event(device, 0, op_event) + ttnn.execute_trace(device, tid, cq_id=0, blocking=True) + ttnn.dump_device_profiler(device) + + ttnn.synchronize_devices(device) + if use_signpost: + signpost(header="start") + outputs = [] + profiler.start(f"run") + for iter in range(0, num_measurement_iterations): + ttnn.wait_for_event(device, 1, op_event) + ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) + ttnn.record_event(device, 1, write_event) + ttnn.wait_for_event(device, 0, write_event) + # TODO: Add in place support to ttnn to_memory_config + reshard_out = ttnn.experimental.tensor.reshard(tt_image_res, input_mem_config, reshard_out) + ttnn.record_event(device, 0, op_event) + ttnn.execute_trace(device, tid, cq_id=0, blocking=False) + outputs.append(tt_output_res.cpu(blocking=False)) + ttnn.synchronize_devices(device) + profiler.end(f"run") + if use_signpost: + signpost(header="stop") + ttnn.dump_device_profiler(device) + + +def run_perf_resnet( + device_batch_size, + expected_inference_time, + expected_compile_time, + hf_cat_image_sample_input, + device, + model_version, + model_location_generator, +): + profiler.clear() + disable_persistent_kernel_cache() + if device_batch_size <= 2: + pytest.skip("Batch size 1 and 2 are not supported with sharded data") + num_devices = 1 if isinstance(device, ttnn.Device) else device.get_num_devices() + batch_size = device_batch_size * num_devices + first_key = f"first_iter_batchsize{batch_size}" + second_key = f"second_iter_batchsize{batch_size}" + cpu_key = f"ref_key_batchsize{batch_size}" + model_name = "microsoft/resnet-50" + + image = hf_cat_image_sample_input + image_processor = AutoImageProcessor.from_pretrained(model_name) + inputs = image_processor(image, return_tensors="pt") + + inputs = inputs["pixel_values"].bfloat16() + comments = f"{list(inputs.shape)[-2]}x{list(inputs.shape)[-1]}_batchsize{batch_size}" + + inputs1 = inputs + for i in range(batch_size - 1): + inputs = torch.cat((inputs, inputs1), dim=0) + + inputs_mesh_mapper = ttnn.ShardTensorToMesh(device, dim=0) + weights_mesh_mapper = ttnn.ReplicateTensorToMesh(device) + output_mesh_composer = ttnn.ConcatMeshToTensor(device, dim=0) + + torch_resnet50 = load_resnet50_model(model_location_generator) + torch_resnet50.eval() + + parameters = preprocess_model_parameters( + initialize_model=lambda: torch_resnet50, + custom_preprocessor=create_custom_mesh_preprocessor(weights_mesh_mapper), + device=None, + ) + torch_resnet50.to(torch.bfloat16) + + tt_resnet50 = resnet50( + device=device, + parameters=parameters, + batch_size=device_batch_size, + model_config=model_config, + dealloc_input=True, + 
final_output_mem_config=ttnn.DRAM_MEMORY_CONFIG if "trace" in model_version else ttnn.L1_MEMORY_CONFIG, + mesh_mapper=weights_mesh_mapper, + ) + ttnn.synchronize_devices(device) + + num_warmup_iterations = 5 + num_measurement_iterations = 15 + + with torch.no_grad(): + profiler.start(cpu_key) + logits = torch_resnet50(inputs) + profiler.end(cpu_key) + + tt_inputs = tt_resnet50.preprocessing(inputs, inputs_mesh_mapper) + if "resnet50_trace_2cqs" in model_version: + run_trace_2cq_model( + device, + tt_inputs, + tt_resnet50, + inputs_mesh_mapper, + output_mesh_composer, + num_warmup_iterations, + num_measurement_iterations, + ) + elif "resnet50_2cqs" in model_version: + run_2cq_model( + device, + tt_inputs, + tt_resnet50, + inputs_mesh_mapper, + output_mesh_composer, + num_warmup_iterations, + num_measurement_iterations, + ) + elif "resnet50_trace" in model_version: + run_trace_model( + device, + tt_inputs, + tt_resnet50, + inputs_mesh_mapper, + output_mesh_composer, + num_warmup_iterations, + num_measurement_iterations, + ) + elif "resnet50" in model_version: + run_model( + device, + tt_inputs, + tt_resnet50, + inputs_mesh_mapper, + output_mesh_composer, + num_warmup_iterations, + num_measurement_iterations, + ) + else: + assert False, f"Model version to run {model_version} not found" + + first_iter_time = profiler.get(f"compile") + profiler.get(f"cache") + + # ensuring inference time fluctuations is not noise + inference_time_avg = profiler.get("run") / num_measurement_iterations + + cpu_time = profiler.get(cpu_key) + compile_time = first_iter_time - 2 * inference_time_avg + prep_perf_report( + model_name=f"ttnn_{model_version}_batch_size{batch_size}", + batch_size=batch_size, + inference_and_compile_time=first_iter_time, + inference_time=inference_time_avg, + expected_compile_time=expected_compile_time, + expected_inference_time=expected_inference_time, + comments=comments, + inference_time_cpu=cpu_time, + ) + + logger.info(f"{model_name} {comments} inference time (avg): {inference_time_avg}") + logger.info(f"{model_name} compile time: {compile_time}") + + +@run_for_wormhole_b0() +@pytest.mark.model_perf_t3000 +@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576}], indirect=True) +@pytest.mark.parametrize( + "device_batch_size, enable_async_mode, expected_inference_time, expected_compile_time", + ( + (16, True, 0.0094, 60), + (16, False, 0.0230, 60), + ), + indirect=["enable_async_mode"], +) +def test_perf_t3000( + device_mesh, + use_program_cache, + device_batch_size, + expected_inference_time, + expected_compile_time, + hf_cat_image_sample_input, + enable_async_mode, + model_location_generator, +): + mode = "async" if enable_async_mode else "sync" + run_perf_resnet( + device_batch_size, + expected_inference_time, + expected_compile_time, + hf_cat_image_sample_input, + device_mesh, + f"resnet50_{mode}", + model_location_generator, + ) + + +@run_for_wormhole_b0() +@pytest.mark.model_perf_t3000 +@pytest.mark.parametrize("device_params", [{"l1_small_size": 32768, "trace_region_size": 1500000}], indirect=True) +@pytest.mark.parametrize( + "device_batch_size, enable_async_mode, expected_inference_time, expected_compile_time", + ( + (16, True, 0.0068, 60), + (16, False, 0.0111, 60), + ), + indirect=["enable_async_mode"], +) +def test_perf_trace_t3000( + device_mesh, + use_program_cache, + device_batch_size, + expected_inference_time, + expected_compile_time, + hf_cat_image_sample_input, + enable_async_mode, + model_location_generator, +): + mode = "async" if enable_async_mode 
else "sync" + run_perf_resnet( + device_batch_size, + expected_inference_time, + expected_compile_time, + hf_cat_image_sample_input, + device_mesh, + f"resnet50_trace_{mode}", + model_location_generator, + ) + + +@run_for_wormhole_b0() +@pytest.mark.model_perf_t3000 +@pytest.mark.parametrize("device_params", [{"l1_small_size": 32768, "num_command_queues": 2}], indirect=True) +@pytest.mark.parametrize( + "device_batch_size, enable_async_mode, expected_inference_time, expected_compile_time", + ( + (16, True, 0.0105, 60), + (16, False, 0.0220, 60), + ), + indirect=["enable_async_mode"], +) +def test_perf_2cqs_t3000( + device_mesh, + use_program_cache, + device_batch_size, + expected_inference_time, + expected_compile_time, + hf_cat_image_sample_input, + enable_async_mode, + model_location_generator, +): + mode = "async" if enable_async_mode else "sync" + run_perf_resnet( + device_batch_size, + expected_inference_time, + expected_compile_time, + hf_cat_image_sample_input, + device_mesh, + f"resnet50_2cqs_{mode}", + model_location_generator, + ) + + +@run_for_wormhole_b0() +@pytest.mark.model_perf_t3000 +@pytest.mark.parametrize( + "device_params", [{"l1_small_size": 32768, "num_command_queues": 2, "trace_region_size": 1332224}], indirect=True +) +@pytest.mark.parametrize( + "device_batch_size, enable_async_mode, expected_inference_time, expected_compile_time", + ( + (16, True, 0.0043, 60), + (16, False, 0.009, 60), + ), + indirect=["enable_async_mode"], +) +def test_perf_trace_2cqs_t3000( + device_mesh, + use_program_cache, + device_batch_size, + expected_inference_time, + expected_compile_time, + hf_cat_image_sample_input, + enable_async_mode, + model_location_generator, +): + mode = "async" if enable_async_mode else "sync" + run_perf_resnet( + device_batch_size, + expected_inference_time, + expected_compile_time, + hf_cat_image_sample_input, + device_mesh, + f"resnet50_trace_2cqs_{mode}", + model_location_generator, + ) diff --git a/models/demos/ttnn_resnet/tests/multi_device/test_ttnn_resnet50_performant.py b/models/demos/ttnn_resnet/tests/multi_device/test_ttnn_resnet50_performant.py index c0b836e44d9..8b6e972a86c 100644 --- a/models/demos/ttnn_resnet/tests/multi_device/test_ttnn_resnet50_performant.py +++ b/models/demos/ttnn_resnet/tests/multi_device/test_ttnn_resnet50_performant.py @@ -21,11 +21,44 @@ use_signpost = False +def create_event(device): + event = [] + if isinstance(device, ttnn.Device): + event.append(tt_lib.device.CreateEvent()) + else: + for dev in device.get_device_ids(): + event.append(tt_lib.device.CreateEvent()) + return event + + +def wait_for_event(device, cq_id, event): + if isinstance(device, ttnn.Device): + tt_lib.device.WaitForEvent(device, cq_id, event) + else: + for dev, eve in zip(device.get_device_ids(), event): + tt_lib.device.WaitForEvent(device.get_device(dev), cq_id, eve) + + +def record_event(device, cq_id, event): + if isinstance(device, ttnn.Device): + tt_lib.device.RecordEvent(device, cq_id, event) + else: + for dev, eve in zip(device.get_device_ids(), event): + tt_lib.device.RecordEvent(device.get_device(dev), cq_id, eve) + + +def buffer_address(tensor): + addr = [] + for ten in ttnn.get_device_tensors(tensor): + addr.append(ten.buffer_address()) + return addr + + # TODO: Create ttnn apis for these -ttnn.create_event = tt_lib.device.CreateEvent -ttnn.wait_for_event = tt_lib.device.WaitForEvent -ttnn.record_event = tt_lib.device.RecordEvent -ttnn.dump_device_profiler = tt_lib.device.DumpDeviceProfiler +ttnn.create_event = create_event 
+ttnn.wait_for_event = wait_for_event +ttnn.record_event = record_event +ttnn.buffer_address = buffer_address # TODO: Move these into Resnet model preprocessing/member functions @@ -84,7 +117,7 @@ def setup_dram_sharded_input(device, tt_inputs, tt_resnet50, mesh_mapper, mesh_c "device_batch_size, act_dtype, weight_dtype, math_fidelity", ((16, ttnn.bfloat8_b, ttnn.bfloat8_b, ttnn.MathFidelity.LoFi),), ) -@pytest.mark.parametrize("enable_async", [True, False]) +@pytest.mark.parametrize("enable_async_mode", [True, False], indirect=True) def test_run_resnet50_inference( device_mesh, use_program_cache, @@ -92,7 +125,7 @@ def test_run_resnet50_inference( act_dtype, weight_dtype, math_fidelity, - enable_async, + enable_async_mode, model_location_generator, ): if device_batch_size == 8: @@ -100,10 +133,6 @@ def test_run_resnet50_inference( if is_wormhole_b0() and device_batch_size == 20: pytest.skip("Skipping batch size 20 for Wormhole B0 due to fitting issue") - for device in device_mesh.get_device_ids(): - device_mesh.get_device(device).enable_async(enable_async) - device_mesh.get_device(device).enable_program_cache() - inputs_mesh_mapper = ttnn.ShardTensorToMesh(device_mesh, dim=0) weights_mesh_mapper = ttnn.ReplicateTensorToMesh(device_mesh) output_mesh_composer = ttnn.ConcatMeshToTensor(device_mesh, dim=0) @@ -146,10 +175,6 @@ def test_run_resnet50_inference( signpost(header="stop") test_infra.validate() - for device in device_mesh.get_device_ids(): - device_mesh.get_device(device).enable_async(False) - device_mesh.get_device(device).disable_and_clear_program_cache() - @run_for_wormhole_b0() @pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "trace_region_size": 800768}], indirect=True) @@ -157,7 +182,7 @@ def test_run_resnet50_inference( "device_batch_size, act_dtype, weight_dtype, math_fidelity", ((16, ttnn.bfloat8_b, ttnn.bfloat8_b, ttnn.MathFidelity.LoFi),), ) -@pytest.mark.parametrize("enable_async", [True, False]) +@pytest.mark.parametrize("enable_async_mode", [True, False], indirect=True) def test_run_resnet50_trace_inference( device_mesh, use_program_cache, @@ -165,7 +190,7 @@ def test_run_resnet50_trace_inference( act_dtype, weight_dtype, math_fidelity, - enable_async, + enable_async_mode, model_location_generator, ): if device_batch_size == 8: @@ -173,10 +198,6 @@ def test_run_resnet50_trace_inference( if is_wormhole_b0() and device_batch_size == 20: pytest.skip("Skipping batch size 20 for Wormhole B0 due to fitting issue") - for device in device_mesh.get_device_ids(): - device_mesh.get_device(device).enable_async(enable_async) - device_mesh.get_device(device).enable_program_cache() - inputs_mesh_mapper = ttnn.ShardTensorToMesh(device_mesh, dim=0) weights_mesh_mapper = ttnn.ReplicateTensorToMesh(device_mesh) output_mesh_composer = ttnn.ConcatMeshToTensor(device_mesh, dim=0) @@ -230,19 +251,14 @@ def test_run_resnet50_trace_inference( signpost(header="stop") test_infra.validate() - for device in device_mesh.get_device_ids(): - device_mesh.get_device(device).enable_async(False) - device_mesh.get_device(device).disable_and_clear_program_cache() - -@pytest.mark.skip() @run_for_wormhole_b0() -@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "num_hw_cqs": 2}], indirect=True) +@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "num_command_queues": 2}], indirect=True) @pytest.mark.parametrize( "device_batch_size, act_dtype, weight_dtype, math_fidelity", ((16, ttnn.bfloat8_b, ttnn.bfloat8_b, ttnn.MathFidelity.LoFi),), ) 
-@pytest.mark.parametrize("enable_async", [True, False]) +@pytest.mark.parametrize("enable_async_mode", [True, False], indirect=True) def test_run_resnet50_2cqs_inference( device_mesh, use_program_cache, @@ -250,16 +266,13 @@ def test_run_resnet50_2cqs_inference( act_dtype, weight_dtype, math_fidelity, - enable_async, + enable_async_mode, model_location_generator, ): if device_batch_size == 8: pytest.skip("Skipping batch size 8 due to memory config issue") if is_wormhole_b0() and device_batch_size == 20: pytest.skip("Skipping batch size 20 for Wormhole B0 due to fitting issue") - for device in device_mesh.get_device_ids(): - device_mesh.get_device(device).enable_async(enable_async) - device_mesh.get_device(device).enable_program_cache() inputs_mesh_mapper = ttnn.ShardTensorToMesh(device_mesh, dim=0) weights_mesh_mapper = ttnn.ReplicateTensorToMesh(device_mesh) @@ -287,8 +300,8 @@ def test_run_resnet50_2cqs_inference( output_mesh_composer, ) tt_image_res = tt_inputs_host.to(device_mesh, sharded_mem_config_DRAM) - op_event = ttnn.create_event() - write_event = ttnn.create_event() + op_event = ttnn.create_event(device_mesh) + write_event = ttnn.create_event(device_mesh) # Initialize the op event so we can write ttnn.record_event(device_mesh, 0, op_event) @@ -325,28 +338,23 @@ def test_run_resnet50_2cqs_inference( ttnn.record_event(device_mesh, 0, op_event) outputs.append(ttnn.from_device(test_infra.run(), blocking=False)) - ttnn.synchronize_devices(device) + ttnn.synchronize_devices(device_mesh) if use_signpost: signpost(header="stop") for output in outputs: test_infra.validate(output) - for device in device_mesh.get_device_ids(): - device_mesh.get_device(device).enable_async(False) - device_mesh.get_device(device).disable_and_clear_program_cache() - -@pytest.mark.skip() @run_for_wormhole_b0() @pytest.mark.parametrize( - "device_params", [{"l1_small_size": 24576, "trace_region_size": 800768, "num_hw_cqs": 2}], indirect=True + "device_params", [{"l1_small_size": 24576, "trace_region_size": 800768, "num_command_queues": 2}], indirect=True ) @pytest.mark.parametrize( "device_batch_size, act_dtype, weight_dtype, math_fidelity", ((16, ttnn.bfloat8_b, ttnn.bfloat8_b, ttnn.MathFidelity.LoFi),), ) -@pytest.mark.parametrize("enable_async", [True, False]) +@pytest.mark.parametrize("enable_async_mode", [True, False], indirect=True) def test_run_resnet50_trace_2cqs_inference( device_mesh, use_program_cache, @@ -354,16 +362,13 @@ def test_run_resnet50_trace_2cqs_inference( act_dtype, weight_dtype, math_fidelity, - enable_async, + enable_async_mode, model_location_generator, ): if device_batch_size == 8: pytest.skip("Skipping batch size 8 due to memory config issue") if is_wormhole_b0() and device_batch_size == 20: pytest.skip("Skipping batch size 20 for Wormhole B0 due to fitting issue") - for device in device_mesh.get_device_ids(): - device_mesh.get_device(device).enable_async(enable_async) - device_mesh.get_device(device).enable_program_cache() inputs_mesh_mapper = ttnn.ShardTensorToMesh(device_mesh, dim=0) weights_mesh_mapper = ttnn.ReplicateTensorToMesh(device_mesh) @@ -391,8 +396,8 @@ def test_run_resnet50_trace_2cqs_inference( output_mesh_composer, ) tt_image_res = tt_inputs_host.to(device_mesh, sharded_mem_config_DRAM) - op_event = ttnn.create_event() - write_event = ttnn.create_event() + op_event = ttnn.create_event(device_mesh) + write_event = ttnn.create_event(device_mesh) # Initialize the op event so we can write ttnn.record_event(device_mesh, 0, op_event) @@ -412,7 +417,7 @@ def 
test_run_resnet50_trace_2cqs_inference( ttnn.record_event(device_mesh, 1, write_event) ttnn.wait_for_event(device_mesh, 0, write_event) test_infra.input_tensor = ttnn.to_memory_config(tt_image_res, input_mem_config) - first_out_addr = test_infra.input_tensor.buffer_address() + first_out_addr = ttnn.buffer_address(test_infra.input_tensor) ttnn.record_event(device_mesh, 0, op_event) test_infra.run() test_infra.validate() @@ -434,7 +439,7 @@ def test_run_resnet50_trace_2cqs_inference( input_mem_config, ) ttnn.end_trace_capture(device_mesh, tid, cq_id=0) - assert first_out_addr == test_infra.input_tensor.buffer_address() + assert first_out_addr == ttnn.buffer_address(test_infra.input_tensor) test_infra.validate() # More optimized run with caching @@ -452,19 +457,10 @@ def test_run_resnet50_trace_2cqs_inference( ) ttnn.record_event(device_mesh, 0, op_event) ttnn.execute_trace(device_mesh, tid, cq_id=0, blocking=False) - outputs.append( - ttnn.from_device( - test_infra.output_tensor, device=device_mesh, mesh_composer=output_mesh_composer, blocking=False - ) - ) - + outputs.append(ttnn.from_device(test_infra.output_tensor, blocking=False)) ttnn.synchronize_devices(device_mesh) if use_signpost: signpost(header="stop") for output in outputs: test_infra.validate(output) - - for device in device_mesh.get_device_ids(): - device_mesh.get_device(device).enable_async(False) - device_mesh.get_device(device).disable_and_clear_program_cache() diff --git a/models/demos/ttnn_resnet/tests/test_perf_device_ttnn_resnet.py b/models/demos/ttnn_resnet/tests/test_perf_device_ttnn_resnet.py index 7a84e96d980..df7fcf64bf5 100644 --- a/models/demos/ttnn_resnet/tests/test_perf_device_ttnn_resnet.py +++ b/models/demos/ttnn_resnet/tests/test_perf_device_ttnn_resnet.py @@ -4,10 +4,10 @@ import pytest from models.perf.device_perf_utils import run_device_perf, check_device_perf, prep_device_perf_report -from models.utility_functions import skip_for_grayskull +from models.utility_functions import run_for_wormhole_b0 -@skip_for_grayskull(reason_str="Untested for Grayskull") +@run_for_wormhole_b0() @pytest.mark.models_device_performance_bare_metal @pytest.mark.parametrize( "batch_size, test, expected_perf", diff --git a/models/demos/ttnn_resnet/tests/test_perf_ttnn_resnet.py b/models/demos/ttnn_resnet/tests/test_perf_ttnn_resnet.py index a133e3c1314..ee533095d44 100644 --- a/models/demos/ttnn_resnet/tests/test_perf_ttnn_resnet.py +++ b/models/demos/ttnn_resnet/tests/test_perf_ttnn_resnet.py @@ -16,7 +16,7 @@ from models.utility_functions import ( profiler, disable_persistent_kernel_cache, - skip_for_grayskull, + run_for_wormhole_b0, ) from models.perf.perf_utils import prep_perf_report @@ -69,7 +69,7 @@ def run_model(device, tt_inputs, tt_resnet50, num_warmup_iterations, num_measure _ = ttnn.from_device(tt_resnet50(tt_inputs, device, ops_parallel_config), blocking=True) ttnn.dump_device_profiler(device) - ttnn.device.synchronize_device(device) + ttnn.synchronize_device(device) if use_signpost: signpost(header="start") outputs = [] @@ -77,7 +77,7 @@ def run_model(device, tt_inputs, tt_resnet50, num_warmup_iterations, num_measure for iter in range(0, num_measurement_iterations): tt_inputs = tt_inputs_host.to(device, input_mem_config) outputs.append(ttnn.from_device(tt_resnet50(tt_inputs, device, ops_parallel_config), blocking=False)) - ttnn.device.synchronize_device(device) + ttnn.synchronize_device(device) profiler.end(f"run") if use_signpost: signpost(header="stop") @@ -125,7 +125,7 @@ def run_2cq_model(device, tt_inputs, 
tt_resnet50, num_warmup_iterations, num_mea _ = ttnn.from_device(tt_resnet50(reshard_out, device, ops_parallel_config), blocking=True) ttnn.dump_device_profiler(device) - ttnn.device.synchronize_device(device) + ttnn.synchronize_device(device) if use_signpost: signpost(header="start") outputs = [] @@ -138,7 +138,7 @@ def run_2cq_model(device, tt_inputs, tt_resnet50, num_warmup_iterations, num_mea reshard_out = ttnn.to_memory_config(tt_image_res, input_mem_config) ttnn.record_event(device, 0, op_event) outputs.append(ttnn.from_device(tt_resnet50(reshard_out, device, ops_parallel_config), blocking=False)) - ttnn.device.synchronize_device(device) + ttnn.synchronize_device(device) profiler.end(f"run") if use_signpost: signpost(header="stop") @@ -178,7 +178,7 @@ def run_trace_model(device, tt_inputs, tt_resnet50, num_warmup_iterations, num_m _ = ttnn.from_device(tt_output_res, blocking=True) ttnn.dump_device_profiler(device) - ttnn.device.synchronize_device(device) + ttnn.synchronize_device(device) if use_signpost: signpost(header="start") outputs = [] @@ -187,7 +187,7 @@ def run_trace_model(device, tt_inputs, tt_resnet50, num_warmup_iterations, num_m ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res) ttnn.execute_trace(device, tid, cq_id=0, blocking=False) outputs.append(ttnn.from_device(tt_output_res, blocking=False)) - ttnn.device.synchronize_device(device) + ttnn.synchronize_device(device) profiler.end(f"run") if use_signpost: signpost(header="stop") @@ -255,7 +255,7 @@ def run_trace_2cq_model(device, tt_inputs, tt_resnet50, num_warmup_iterations, n ttnn.execute_trace(device, tid, cq_id=0, blocking=True) ttnn.dump_device_profiler(device) - ttnn.device.synchronize_device(device) + ttnn.synchronize_device(device) if use_signpost: signpost(header="start") outputs = [] @@ -270,7 +270,7 @@ def run_trace_2cq_model(device, tt_inputs, tt_resnet50, num_warmup_iterations, n ttnn.record_event(device, 0, op_event) ttnn.execute_trace(device, tid, cq_id=0, blocking=False) outputs.append(tt_output_res.cpu(blocking=False)) - ttnn.device.synchronize_device(device) + ttnn.synchronize_device(device) profiler.end(f"run") if use_signpost: signpost(header="stop") @@ -322,7 +322,7 @@ def run_perf_resnet( dealloc_input=True, final_output_mem_config=ttnn.DRAM_MEMORY_CONFIG if "trace" in model_version else ttnn.L1_MEMORY_CONFIG, ) - ttnn.device.synchronize_device(device) + ttnn.synchronize_device(device) num_warmup_iterations = 5 num_measurement_iterations = 15 @@ -366,9 +366,9 @@ def run_perf_resnet( logger.info(f"{model_name} compile time: {compile_time}") -@skip_for_grayskull(reason_str="Untested for Grayskull") -@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576}], indirect=True) +@run_for_wormhole_b0() @pytest.mark.models_performance_bare_metal +@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576}], indirect=True) @pytest.mark.parametrize( "batch_size, expected_inference_time, expected_compile_time", ((16, 0.006, 25),), @@ -393,15 +393,16 @@ def test_perf_bare_metal( ) -@skip_for_grayskull(reason_str="Untested for Grayskull") -@pytest.mark.parametrize("device_params", [{"l1_small_size": 32768, "trace_region_size": 1500000}], indirect=True) +@run_for_wormhole_b0() @pytest.mark.models_performance_bare_metal +@pytest.mark.parametrize("device_params", [{"l1_small_size": 32768, "trace_region_size": 1500000}], indirect=True) @pytest.mark.parametrize( - "batch_size, enable_async, expected_inference_time, expected_compile_time", + "batch_size, enable_async_mode, 
expected_inference_time, expected_compile_time", ( (16, True, 0.005, 25), (16, False, 0.0046, 25), ), + indirect=["enable_async_mode"], ) def test_perf_trace_bare_metal( device, @@ -410,11 +411,10 @@ def test_perf_trace_bare_metal( expected_inference_time, expected_compile_time, hf_cat_image_sample_input, - enable_async, + enable_async_mode, model_location_generator, ): - device.enable_async(enable_async) - mode = "async" if enable_async else "sync" + mode = "async" if enable_async_mode else "sync" run_perf_resnet( batch_size, expected_inference_time, @@ -424,12 +424,11 @@ def test_perf_trace_bare_metal( f"resnet50_trace_{mode}", model_location_generator, ) - device.enable_async(False) -@skip_for_grayskull(reason_str="Untested for Grayskull") -@pytest.mark.parametrize("device_params", [{"l1_small_size": 32768, "num_hw_cqs": 2}], indirect=True) +@run_for_wormhole_b0() @pytest.mark.models_performance_bare_metal +@pytest.mark.parametrize("device_params", [{"l1_small_size": 32768, "num_hw_cqs": 2}], indirect=True) @pytest.mark.parametrize( "batch_size, expected_inference_time, expected_compile_time", ((16, 0.0064, 25),), @@ -454,11 +453,11 @@ def test_perf_2cqs_bare_metal( ) -@skip_for_grayskull(reason_str="Untested for Grayskull") +@run_for_wormhole_b0() +@pytest.mark.models_performance_bare_metal @pytest.mark.parametrize( "device_params", [{"l1_small_size": 32768, "num_hw_cqs": 2, "trace_region_size": 1332224}], indirect=True ) -@pytest.mark.models_performance_bare_metal @pytest.mark.parametrize( "batch_size, expected_inference_time, expected_compile_time", ((16, 0.004, 25),), diff --git a/models/demos/ttnn_resnet/tests/test_ttnn_resnet50_performant.py b/models/demos/ttnn_resnet/tests/test_ttnn_resnet50_performant.py index 57bf4dd5833..bc1ffff7649 100644 --- a/models/demos/ttnn_resnet/tests/test_ttnn_resnet50_performant.py +++ b/models/demos/ttnn_resnet/tests/test_ttnn_resnet50_performant.py @@ -25,7 +25,6 @@ ttnn.create_event = tt_lib.device.CreateEvent ttnn.wait_for_event = tt_lib.device.WaitForEvent ttnn.record_event = tt_lib.device.RecordEvent -ttnn.dump_device_profiler = tt_lib.device.DumpDeviceProfiler # TODO: Move these into Resnet model preprocessing/member functions @@ -249,7 +248,7 @@ def test_run_resnet50_2cqs_inference( test_infra.input_tensor = ttnn.to_memory_config(tt_image_res, input_mem_config) ttnn.record_event(device, 0, op_event) outputs.append(ttnn.from_device(test_infra.run(), blocking=False)) - ttnn.device.synchronize_device(device) + ttnn.synchronize_device(device) if use_signpost: signpost(header="stop") for output in outputs: @@ -347,7 +346,7 @@ def test_run_resnet50_trace_2cqs_inference( if use_signpost: signpost(header="start") outputs = [] - for iter in range(0, 1): + for iter in range(0, 2): ttnn.wait_for_event(device, 1, op_event) ttnn.copy_host_to_device_tensor(tt_inputs_host, tt_image_res, 1) ttnn.record_event(device, 1, write_event) @@ -360,7 +359,7 @@ def test_run_resnet50_trace_2cqs_inference( ttnn.execute_trace(device, tid, cq_id=0, blocking=False) outputs.append(ttnn.from_device(test_infra.output_tensor, blocking=False)) - ttnn.device.synchronize_device(device) + ttnn.synchronize_device(device) if use_signpost: signpost(header="stop") for output in outputs: diff --git a/models/demos/ttnn_resnet/tests/ttnn_resnet_test_infra.py b/models/demos/ttnn_resnet/tests/ttnn_resnet_test_infra.py index b83f3cfed31..9aa74461298 100644 --- a/models/demos/ttnn_resnet/tests/ttnn_resnet_test_infra.py +++ b/models/demos/ttnn_resnet/tests/ttnn_resnet_test_infra.py @@ -113,7 
+113,7 @@ def load_resnet50_model(model_location_generator): ttnn.MathFidelity.LoFi, ttnn.bfloat8_b, ttnn.bfloat8_b, - ): 0.884609, # Max ATOL Delta: 6.455164909362793, Max RTOL Delta: inf, PCC: 0.8846098380419435 + ): 0.988, # Max ATOL Delta: 6.455164909362793, Max RTOL Delta: inf, PCC: 0.8846098380419435 }, 20: { ( @@ -135,6 +135,19 @@ def load_resnet50_model(model_location_generator): }, } +golden_pcc = { + ttnn.device.Arch.WORMHOLE_B0: golden_pcc, + ttnn.device.Arch.GRAYSKULL: golden_pcc, +} + +golden_pcc[ttnn.device.Arch.GRAYSKULL][16][ + ( + ttnn.MathFidelity.LoFi, + ttnn.bfloat8_b, + ttnn.bfloat8_b, + ) +] = 0.936 + class ResNet50TestInfra: def __init__( @@ -227,7 +240,9 @@ def validate(self, output_tensor=None): valid_pcc = 1.0 if self.batch_size >= 8: - valid_pcc = golden_pcc[self.batch_size][(self.math_fidelity, self.weight_dtype, self.act_dtype)] + valid_pcc = golden_pcc[self.device.arch()][self.batch_size][ + (self.math_fidelity, self.weight_dtype, self.act_dtype) + ] else: if self.act_dtype == ttnn.bfloat8_b: if self.math_fidelity == ttnn.MathFidelity.LoFi: @@ -239,7 +254,6 @@ def validate(self, output_tensor=None): valid_pcc = 0.93 else: valid_pcc = 0.982 - print(valid_pcc) self.pcc_passed, self.pcc_message = assert_with_pcc(self.torch_output_tensor, output_tensor, pcc=valid_pcc) logger.info( diff --git a/models/perf/perf_utils.py b/models/perf/perf_utils.py index 92cc1cc9bf5..3f8e4edfc32 100644 --- a/models/perf/perf_utils.py +++ b/models/perf/perf_utils.py @@ -45,7 +45,7 @@ def merge_perf_files(fname, perf_fname, expected_cols): def process_perf_results(fname, expected_cols): with open(fname) as file: - merge_res = csv.reader(file) + merge_res = csv.reader(file, skipinitialspace=True) logger.info(next(merge_res)[0].strip()) logger.info(next(merge_res)[0].strip()) cols = next(merge_res) diff --git a/tests/scripts/t3000/run_t3000_frequent_tests.sh b/tests/scripts/t3000/run_t3000_frequent_tests.sh index 3ff50232deb..b96a360ce86 100755 --- a/tests/scripts/t3000/run_t3000_frequent_tests.sh +++ b/tests/scripts/t3000/run_t3000_frequent_tests.sh @@ -124,6 +124,24 @@ run_t3000_falcon40b_tests() { fi } +run_t3000_resnet_tests() { + fail=0 + # Record the start time + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_resnet_tests" + + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/ttnn_resnet/tests/multi_device/test_ttnn_resnet50_performant.py ; fail+=$? + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_resnet_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi +} + run_t3000_tests() { # Run ethernet tests run_t3000_ethernet_tests @@ -143,6 +161,9 @@ run_t3000_tests() { # Run mixtral tests run_t3000_mixtral_tests + # Run resnet tests + run_t3000_resnet_tests + # Run trace tests run_t3000_trace_stress_tests diff --git a/tests/scripts/t3000/run_t3000_model_perf_tests.sh b/tests/scripts/t3000/run_t3000_model_perf_tests.sh index 4af78368f90..27b5a0c3ce4 100755 --- a/tests/scripts/t3000/run_t3000_model_perf_tests.sh +++ b/tests/scripts/t3000/run_t3000_model_perf_tests.sh @@ -74,6 +74,24 @@ run_t3000_falcon40b_tests() { fi } +run_t3000_resnet50_tests() { + # Record the start time + fail=0 + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_resnet50_tests" + + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/ttnn_resnet/tests/multi_device/test_perf_ttnn_resnet.py -m "model_perf_t3000" ; fail+=$? 
+ + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_resnet50_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi +} + run_t3000_llm_tests() { # Run falcon7b tests run_t3000_falcon7b_tests @@ -92,6 +110,9 @@ run_t3000_llm_tests() { } run_t3000_cnn_tests() { + # Run resnet50 tests + run_t3000_resnet50_tests + # Merge all the generated reports env python models/perf/merge_perf_results.py } diff --git a/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/sharded/sharded_op.hpp b/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/sharded/sharded_op.hpp index c8830d081ee..27b77624921 100644 --- a/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/sharded/sharded_op.hpp +++ b/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/sharded/sharded_op.hpp @@ -192,9 +192,9 @@ struct Reshard { inline Tensor reshard(const Tensor &input_tensor, const MemoryConfig &output_mem_config, std::optional output_tensor = std::nullopt) { std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; operation::launch_op( - [output_mem_config, output_tensor] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { + [output_mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { const auto& input_tensor = input_tensors.at(0); - return operation::run(Reshard{.output_mem_config = output_mem_config,}, {input_tensor}, {}, {output_tensor}); + return operation::run(Reshard{.output_mem_config = output_mem_config,}, {input_tensor}, {}, {optional_output_tensors}); }, {input_tensor}, output_tensors, {}, {output_tensor}); return output_tensors.at(0); }